Print this page
first pass
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/common/crypto/aes/amd64/aes_amd64.s
+++ new/usr/src/common/crypto/aes/amd64/aes_amd64.s
1 1 /*
2 2 * ---------------------------------------------------------------------------
3 3 * Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
4 4 *
5 5 * LICENSE TERMS
6 6 *
7 7 * The free distribution and use of this software is allowed (with or without
8 8 * changes) provided that:
9 9 *
10 10 * 1. source code distributions include the above copyright notice, this
11 11 * list of conditions and the following disclaimer;
12 12 *
13 13 * 2. binary distributions include the above copyright notice, this list
14 14 * of conditions and the following disclaimer in their documentation;
15 15 *
16 16 * 3. the name of the copyright holder is not used to endorse products
17 17 * built using this software without specific written permission.
18 18 *
19 19 * DISCLAIMER
20 20 *
21 21 * This software is provided 'as is' with no explicit or implied warranties
22 22 * in respect of its properties, including, but not limited to, correctness
23 23 * and/or fitness for purpose.
24 24 * ---------------------------------------------------------------------------
25 25 * Issue 20/12/2007
26 26 *
27 27 * I am grateful to Dag Arne Osvik for many discussions of the techniques that
28 28 * can be used to optimise AES assembler code on AMD64/EM64T architectures.
29 29 * Some of the techniques used in this implementation are the result of
30 30 * suggestions made by him for which I am most grateful.
31 31 *
32 32 * An AES implementation for AMD64 processors using the YASM assembler. This
33 33 * implementation provides only encryption, decryption and hence requires key
34 34 * scheduling support in C. It uses 8k bytes of tables but its encryption and
35 35 * decryption performance is very close to that obtained using large tables.
36 36 * It can use either MS Windows or Gnu/Linux/OpenSolaris OS calling conventions,
37 37 * which are as follows:
38 38 * ms windows gnu/linux/opensolaris os
39 39 *
40 40 * in_blk rcx rdi
41 41 * out_blk rdx rsi
42 42 * context (cx) r8 rdx
43 43 *
44 44 * preserved rsi - + rbx, rbp, rsp, r12, r13, r14 & r15
45 45 * registers rdi - on both
46 46 *
47 47 * destroyed - rsi + rax, rcx, rdx, r8, r9, r10 & r11
48 48 * registers - rdi on both
49 49 *
50 50 * The convention used here is that for gnu/linux/opensolaris os.
51 51 *
52 52 * This code provides the standard AES block size (128 bits, 16 bytes) and the
53 53 * three standard AES key sizes (128, 192 and 256 bits). It has the same call
54 54 * interface as my C implementation. It uses the Microsoft C AMD64 calling
55 55 * conventions in which the three parameters are placed in rcx, rdx and r8
56 56 * respectively. The rbx, rsi, rdi, rbp and r12..r15 registers are preserved.
57 57 *
58 58 * OpenSolaris Note:
59 59 * Modified to use GNU/Linux/Solaris calling conventions.
60 60 * That is parameters are placed in rdi, rsi, rdx, and rcx, respectively.
61 61 *
62 62 * AES_RETURN aes_encrypt(const unsigned char in_blk[],
63 63 * unsigned char out_blk[], const aes_encrypt_ctx cx[1])/
64 64 *
65 65 * AES_RETURN aes_decrypt(const unsigned char in_blk[],
66 66 * unsigned char out_blk[], const aes_decrypt_ctx cx[1])/
67 67 *
68 68 * AES_RETURN aes_encrypt_key<NNN>(const unsigned char key[],
69 69 * const aes_encrypt_ctx cx[1])/
70 70 *
71 71 * AES_RETURN aes_decrypt_key<NNN>(const unsigned char key[],
72 72 * const aes_decrypt_ctx cx[1])/
73 73 *
74 74 * AES_RETURN aes_encrypt_key(const unsigned char key[],
75 75 * unsigned int len, const aes_decrypt_ctx cx[1])/
76 76 *
77 77 * AES_RETURN aes_decrypt_key(const unsigned char key[],
78 78 * unsigned int len, const aes_decrypt_ctx cx[1])/
79 79 *
80 80 * where <NNN> is 128, 192 or 256. In the last two calls the length can be in
81 81 * either bits or bytes.
82 82 *
83 83 * Comment in/out the following lines to obtain the desired subroutines. These
84 84 * selections MUST match those in the C header file aesopt.h
85 85 */
86 86 #define AES_REV_DKS /* define if key decryption schedule is reversed */
87 87
88 88 #define LAST_ROUND_TABLES /* define for the faster version using extra tables */
89 89
90 90 /*
91 91 * The encryption key schedule has the following in memory layout where N is the
92 92 * number of rounds (10, 12 or 14):
93 93 *
94 94 * lo: | input key (round 0) | / each round is four 32-bit words
95 95 * | encryption round 1 |
96 96 * | encryption round 2 |
97 97 * ....
98 98 * | encryption round N-1 |
99 99 * hi: | encryption round N |
100 100 *
101 101 * The decryption key schedule is normally set up so that it has the same
102 102 * layout as above by actually reversing the order of the encryption key
103 103 * schedule in memory (this happens when AES_REV_DKS is set):
104 104 *
105 105 * lo: | decryption round 0 | = | encryption round N |
106 106 * | decryption round 1 | = INV_MIX_COL[ | encryption round N-1 | ]
107 107 * | decryption round 2 | = INV_MIX_COL[ | encryption round N-2 | ]
108 108 * .... ....
109 109 * | decryption round N-1 | = INV_MIX_COL[ | encryption round 1 | ]
110 110 * hi: | decryption round N | = | input key (round 0) |
111 111 *
112 112 * with rounds except the first and last modified using inv_mix_column()
113 113 * But if AES_REV_DKS is NOT set the order of keys is left as it is for
114 114 * encryption so that it has to be accessed in reverse when used for
115 115 * decryption (although the inverse mix column modifications are done)
116 116 *
117 117 * lo: | decryption round 0 | = | input key (round 0) |
118 118 * | decryption round 1 | = INV_MIX_COL[ | encryption round 1 | ]
119 119 * | decryption round 2 | = INV_MIX_COL[ | encryption round 2 | ]
120 120 * .... ....
121 121 * | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ]
122 122 * hi: | decryption round N | = | encryption round N |
123 123 *
124 124 * This layout is faster when the assembler key scheduling provided here
125 125 * is used.
126 126 *
127 127 * End of user defines
128 128 */
129 129
130 130 /*
131 131 * ---------------------------------------------------------------------------
132 132 * OpenSolaris OS modifications
133 133 *
134 134 * This source originates from Brian Gladman file aes_amd64.asm
135 135 * in http://fp.gladman.plus.com/AES/aes-src-04-03-08.zip
136 136 * with these changes:
137 137 *
138 138 * 1. Removed MS Windows-specific code within DLL_EXPORT, _SEH_, and
139 139 * !__GNUC__ ifdefs. Also removed ENCRYPTION, DECRYPTION,
140 140 * AES_128, AES_192, AES_256, AES_VAR ifdefs.
141 141 *
142 142 * 2. Translate yasm/nasm %define and .macro definitions to cpp(1) #define
143 143 *
144 144 * 3. Translate yasm/nasm %ifdef/%ifndef to cpp(1) #ifdef
145 145 *
146 146 * 4. Translate Intel/yasm/nasm syntax to ATT/OpenSolaris as(1) syntax
147 147 * (operands reversed, literals prefixed with "$", registers prefixed with "%",
148 148 * and "[register+offset]", addressing changed to "offset(register)",
↓ open down ↓ |
148 lines elided |
↑ open up ↑ |
149 149 * parenthesis in constant expressions "()" changed to square brackets "[]",
150 150 * "." removed from local (numeric) labels, and other changes.
151 151 * Examples:
152 152 * Intel/yasm/nasm Syntax ATT/OpenSolaris Syntax
153 153 * mov rax,(4*20h) mov $[4*0x20],%rax
154 154 * mov rax,[ebx+20h] mov 0x20(%ebx),%rax
155 155 * lea rax,[ebx+ecx] lea (%ebx,%ecx),%rax
156 156 * sub rax,[ebx+ecx*4-20h] sub -0x20(%ebx,%ecx,4),%rax
157 157 *
158 158 * 5. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
159 - * /usr/include/sys/asm_linkage.h, lint(1B) guards, EXPORT DELETE START
160 - * and EXPORT DELETE END markers, and dummy C function definitions for lint.
159 + * /usr/include/sys/asm_linkage.h, lint(1B) guards, and dummy C function
160 + * definitions for lint.
161 161 *
162 162 * 6. Renamed functions and reordered parameters to match OpenSolaris:
163 163 * Original Gladman interface:
164 164 * int aes_encrypt(const unsigned char *in,
165 165 * unsigned char *out, const aes_encrypt_ctx cx[1])/
166 166 * int aes_decrypt(const unsigned char *in,
167 167 * unsigned char *out, const aes_encrypt_ctx cx[1])/
168 168 * Note: aes_encrypt_ctx contains ks, a 60 element array of uint32_t,
169 169 * and a union type, inf., containing inf.l, a uint32_t and
170 170 * inf.b, a 4-element array of uint8_t. Only b[0] in the array (aka "l") is
171 171 * used and contains the number of rounds * 16, where the number of rounds is
172 172 * 10, 12, or 14.
173 173 *
174 174 * OpenSolaris OS interface:
175 175 * void aes_encrypt_amd64(const aes_ks_t *ks, int Nr,
176 176 * const uint32_t pt[4], uint32_t ct[4])/
177 177 * void aes_decrypt_amd64(const aes_ks_t *ks, int Nr,
178 178 * const uint32_t ct[4], uint32_t pt[4])/
179 179 * typedef union {uint64_t ks64[(MAX_AES_NR + 1) * 4]/
180 180 * uint32_t ks32[(MAX_AES_NR + 1) * 4]/ } aes_ks_t/
181 181 * Note: ks is the AES key schedule, Nr is number of rounds, pt is plain text,
182 182 * ct is crypto text, and MAX_AES_NR is 14.
183 183 * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64.
184 184 */
185 185
186 186 #if defined(lint) || defined(__lint)
187 187
188 188 #include <sys/types.h>
189 189 /* ARGSUSED */
190 190 void
191 191 aes_encrypt_amd64(const uint32_t rk[], int Nr, const uint32_t pt[4],
192 192 uint32_t ct[4]) {
193 193 } /* intentionally empty: lint-only stub; the real routine is the assembly in the #else branch */
194 194 /* ARGSUSED */
195 195 void
196 196 aes_decrypt_amd64(const uint32_t rk[], int Nr, const uint32_t ct[4],
197 197 uint32_t pt[4]) {
198 198 } /* intentionally empty: lint-only stub; the real routine is the assembly in the #else branch */
199 199
200 200
201 201 #else
202 202
203 203 #include <sys/asm_linkage.h>
204 204
205 205 #define KS_LENGTH 60 /* 32-bit words in the Gladman ks[] array; 4*KS_LENGTH is the byte offset read under GLADMAN_INTERFACE */
206 206
207 207 #define raxd eax /* raxd..rdid: 32-bit names of rax..rdi, written without the '%' prefix */
208 208 #define rdxd edx
209 209 #define rcxd ecx
210 210 #define rbxd ebx
211 211 #define rsid esi
212 212 #define rdid edi
213 213
214 214 #define raxb al /* raxb..rdib: low-byte names of rax..rdi, written without the '%' prefix */
215 215 #define rdxb dl
216 216 #define rcxb cl
217 217 #define rbxb bl
218 218 #define rsib sil
219 219 #define rdib dil
220 220
221 221 / finite field multiplies by {02}, {04} and {08}
222 222
223 223 #define f2(x) [[x<<1]^[[[x>>7]&1]*0x11b]] /* shift left 1; xor 0x11b when bit 7 of x is set */
224 224 #define f4(x) [[x<<2]^[[[x>>6]&1]*0x11b]^[[[x>>6]&2]*0x11b]] /* shift left 2; xor 0x11b for bit 6 and 2*0x11b for bit 7 */
225 225 #define f8(x) [[x<<3]^[[[x>>5]&1]*0x11b]^[[[x>>5]&2]*0x11b]^[[[x>>5]&4]*0x11b]] /* shift left 3; xor 0x11b, 2*0x11b, 4*0x11b for bits 5, 6, 7 */
226 226
227 227 / finite field multiplies required in table generation
228 228
229 229 #define f3(x) [[f2(x)] ^ [x]] /* {03}*x = {02}*x ^ x */
230 230 #define f9(x) [[f8(x)] ^ [x]] /* {09}*x = {08}*x ^ x */
231 231 #define fb(x) [[f8(x)] ^ [f2(x)] ^ [x]] /* {0b}*x = {08}*x ^ {02}*x ^ x */
232 232 #define fd(x) [[f8(x)] ^ [f4(x)] ^ [x]] /* {0d}*x = {08}*x ^ {04}*x ^ x */
233 233 #define fe(x) [[f8(x)] ^ [f4(x)] ^ [f2(x)]] /* {0e}*x = {08}*x ^ {04}*x ^ {02}*x */
234 234
235 235 / macros for expanding S-box data
236 236
237 237 #define u8(x) [f2(x)], [x], [x], [f3(x)], [f2(x)], [x], [x], [f3(x)] /* 8 bytes per value: the {02},{01},{01},{03} multiples, written twice */
238 238 #define v8(x) [fe(x)], [f9(x)], [fd(x)], [fb(x)], [fe(x)], [f9(x)], [fd(x)], [x] /* {0e},{09},{0d},{0b} multiples twice, except byte 7 holds x itself (read via tab_i) */
239 239 #define w8(x) [x], 0, 0, 0, [x], 0, 0, 0 /* x then three zero bytes, written twice; used for the last-round tables */
240 240
241 241 #define enc_vals(x) /* 32 .byte lines of 8 values = 256 entries, each expanded by the macro passed as x */ \
242 242 .byte x(0x63),x(0x7c),x(0x77),x(0x7b),x(0xf2),x(0x6b),x(0x6f),x(0xc5); \
243 243 .byte x(0x30),x(0x01),x(0x67),x(0x2b),x(0xfe),x(0xd7),x(0xab),x(0x76); \
244 244 .byte x(0xca),x(0x82),x(0xc9),x(0x7d),x(0xfa),x(0x59),x(0x47),x(0xf0); \
245 245 .byte x(0xad),x(0xd4),x(0xa2),x(0xaf),x(0x9c),x(0xa4),x(0x72),x(0xc0); \
246 246 .byte x(0xb7),x(0xfd),x(0x93),x(0x26),x(0x36),x(0x3f),x(0xf7),x(0xcc); \
247 247 .byte x(0x34),x(0xa5),x(0xe5),x(0xf1),x(0x71),x(0xd8),x(0x31),x(0x15); \
248 248 .byte x(0x04),x(0xc7),x(0x23),x(0xc3),x(0x18),x(0x96),x(0x05),x(0x9a); \
249 249 .byte x(0x07),x(0x12),x(0x80),x(0xe2),x(0xeb),x(0x27),x(0xb2),x(0x75); \
250 250 .byte x(0x09),x(0x83),x(0x2c),x(0x1a),x(0x1b),x(0x6e),x(0x5a),x(0xa0); \
251 251 .byte x(0x52),x(0x3b),x(0xd6),x(0xb3),x(0x29),x(0xe3),x(0x2f),x(0x84); \
252 252 .byte x(0x53),x(0xd1),x(0x00),x(0xed),x(0x20),x(0xfc),x(0xb1),x(0x5b); \
253 253 .byte x(0x6a),x(0xcb),x(0xbe),x(0x39),x(0x4a),x(0x4c),x(0x58),x(0xcf); \
254 254 .byte x(0xd0),x(0xef),x(0xaa),x(0xfb),x(0x43),x(0x4d),x(0x33),x(0x85); \
255 255 .byte x(0x45),x(0xf9),x(0x02),x(0x7f),x(0x50),x(0x3c),x(0x9f),x(0xa8); \
256 256 .byte x(0x51),x(0xa3),x(0x40),x(0x8f),x(0x92),x(0x9d),x(0x38),x(0xf5); \
257 257 .byte x(0xbc),x(0xb6),x(0xda),x(0x21),x(0x10),x(0xff),x(0xf3),x(0xd2); \
258 258 .byte x(0xcd),x(0x0c),x(0x13),x(0xec),x(0x5f),x(0x97),x(0x44),x(0x17); \
259 259 .byte x(0xc4),x(0xa7),x(0x7e),x(0x3d),x(0x64),x(0x5d),x(0x19),x(0x73); \
260 260 .byte x(0x60),x(0x81),x(0x4f),x(0xdc),x(0x22),x(0x2a),x(0x90),x(0x88); \
261 261 .byte x(0x46),x(0xee),x(0xb8),x(0x14),x(0xde),x(0x5e),x(0x0b),x(0xdb); \
262 262 .byte x(0xe0),x(0x32),x(0x3a),x(0x0a),x(0x49),x(0x06),x(0x24),x(0x5c); \
263 263 .byte x(0xc2),x(0xd3),x(0xac),x(0x62),x(0x91),x(0x95),x(0xe4),x(0x79); \
264 264 .byte x(0xe7),x(0xc8),x(0x37),x(0x6d),x(0x8d),x(0xd5),x(0x4e),x(0xa9); \
265 265 .byte x(0x6c),x(0x56),x(0xf4),x(0xea),x(0x65),x(0x7a),x(0xae),x(0x08); \
266 266 .byte x(0xba),x(0x78),x(0x25),x(0x2e),x(0x1c),x(0xa6),x(0xb4),x(0xc6); \
267 267 .byte x(0xe8),x(0xdd),x(0x74),x(0x1f),x(0x4b),x(0xbd),x(0x8b),x(0x8a); \
268 268 .byte x(0x70),x(0x3e),x(0xb5),x(0x66),x(0x48),x(0x03),x(0xf6),x(0x0e); \
269 269 .byte x(0x61),x(0x35),x(0x57),x(0xb9),x(0x86),x(0xc1),x(0x1d),x(0x9e); \
270 270 .byte x(0xe1),x(0xf8),x(0x98),x(0x11),x(0x69),x(0xd9),x(0x8e),x(0x94); \
271 271 .byte x(0x9b),x(0x1e),x(0x87),x(0xe9),x(0xce),x(0x55),x(0x28),x(0xdf); \
272 272 .byte x(0x8c),x(0xa1),x(0x89),x(0x0d),x(0xbf),x(0xe6),x(0x42),x(0x68); \
273 273 .byte x(0x41),x(0x99),x(0x2d),x(0x0f),x(0xb0),x(0x54),x(0xbb),x(0x16)
274 274
275 275 #define dec_vals(x) /* 32 .byte lines of 8 values = 256 entries, each expanded by the macro passed as x */ \
276 276 .byte x(0x52),x(0x09),x(0x6a),x(0xd5),x(0x30),x(0x36),x(0xa5),x(0x38); \
277 277 .byte x(0xbf),x(0x40),x(0xa3),x(0x9e),x(0x81),x(0xf3),x(0xd7),x(0xfb); \
278 278 .byte x(0x7c),x(0xe3),x(0x39),x(0x82),x(0x9b),x(0x2f),x(0xff),x(0x87); \
279 279 .byte x(0x34),x(0x8e),x(0x43),x(0x44),x(0xc4),x(0xde),x(0xe9),x(0xcb); \
280 280 .byte x(0x54),x(0x7b),x(0x94),x(0x32),x(0xa6),x(0xc2),x(0x23),x(0x3d); \
281 281 .byte x(0xee),x(0x4c),x(0x95),x(0x0b),x(0x42),x(0xfa),x(0xc3),x(0x4e); \
282 282 .byte x(0x08),x(0x2e),x(0xa1),x(0x66),x(0x28),x(0xd9),x(0x24),x(0xb2); \
283 283 .byte x(0x76),x(0x5b),x(0xa2),x(0x49),x(0x6d),x(0x8b),x(0xd1),x(0x25); \
284 284 .byte x(0x72),x(0xf8),x(0xf6),x(0x64),x(0x86),x(0x68),x(0x98),x(0x16); \
285 285 .byte x(0xd4),x(0xa4),x(0x5c),x(0xcc),x(0x5d),x(0x65),x(0xb6),x(0x92); \
286 286 .byte x(0x6c),x(0x70),x(0x48),x(0x50),x(0xfd),x(0xed),x(0xb9),x(0xda); \
287 287 .byte x(0x5e),x(0x15),x(0x46),x(0x57),x(0xa7),x(0x8d),x(0x9d),x(0x84); \
288 288 .byte x(0x90),x(0xd8),x(0xab),x(0x00),x(0x8c),x(0xbc),x(0xd3),x(0x0a); \
289 289 .byte x(0xf7),x(0xe4),x(0x58),x(0x05),x(0xb8),x(0xb3),x(0x45),x(0x06); \
290 290 .byte x(0xd0),x(0x2c),x(0x1e),x(0x8f),x(0xca),x(0x3f),x(0x0f),x(0x02); \
291 291 .byte x(0xc1),x(0xaf),x(0xbd),x(0x03),x(0x01),x(0x13),x(0x8a),x(0x6b); \
292 292 .byte x(0x3a),x(0x91),x(0x11),x(0x41),x(0x4f),x(0x67),x(0xdc),x(0xea); \
293 293 .byte x(0x97),x(0xf2),x(0xcf),x(0xce),x(0xf0),x(0xb4),x(0xe6),x(0x73); \
294 294 .byte x(0x96),x(0xac),x(0x74),x(0x22),x(0xe7),x(0xad),x(0x35),x(0x85); \
295 295 .byte x(0xe2),x(0xf9),x(0x37),x(0xe8),x(0x1c),x(0x75),x(0xdf),x(0x6e); \
296 296 .byte x(0x47),x(0xf1),x(0x1a),x(0x71),x(0x1d),x(0x29),x(0xc5),x(0x89); \
297 297 .byte x(0x6f),x(0xb7),x(0x62),x(0x0e),x(0xaa),x(0x18),x(0xbe),x(0x1b); \
298 298 .byte x(0xfc),x(0x56),x(0x3e),x(0x4b),x(0xc6),x(0xd2),x(0x79),x(0x20); \
299 299 .byte x(0x9a),x(0xdb),x(0xc0),x(0xfe),x(0x78),x(0xcd),x(0x5a),x(0xf4); \
300 300 .byte x(0x1f),x(0xdd),x(0xa8),x(0x33),x(0x88),x(0x07),x(0xc7),x(0x31); \
301 301 .byte x(0xb1),x(0x12),x(0x10),x(0x59),x(0x27),x(0x80),x(0xec),x(0x5f); \
302 302 .byte x(0x60),x(0x51),x(0x7f),x(0xa9),x(0x19),x(0xb5),x(0x4a),x(0x0d); \
303 303 .byte x(0x2d),x(0xe5),x(0x7a),x(0x9f),x(0x93),x(0xc9),x(0x9c),x(0xef); \
304 304 .byte x(0xa0),x(0xe0),x(0x3b),x(0x4d),x(0xae),x(0x2a),x(0xf5),x(0xb0); \
305 305 .byte x(0xc8),x(0xeb),x(0xbb),x(0x3c),x(0x83),x(0x53),x(0x99),x(0x61); \
306 306 .byte x(0x17),x(0x2b),x(0x04),x(0x7e),x(0xba),x(0x77),x(0xd6),x(0x26); \
307 307 .byte x(0xe1),x(0x69),x(0x14),x(0x63),x(0x55),x(0x21),x(0x0c),x(0x7d)
308 308
309 309 #define tptr %rbp /* table pointer */
310 310 #define kptr %r8 /* key schedule pointer */
311 311 #define fofs 128 /* adjust offset in key schedule to keep |disp| < 128 */
312 312 #define fk_ref(x, y) -16*x+fofs+4*y(kptr) /* memory operand: 32-bit word y of the round key 16*x bytes before (kptr + fofs) */
313 313
314 314 #ifdef AES_REV_DKS
315 315 #define rofs 128
316 316 #define ik_ref(x, y) -16*x+rofs+4*y(kptr) /* reversed schedule: word y of the key 16*x bytes before (kptr + rofs) */
317 317
318 318 #else
319 319 #define rofs -128
↓ open down ↓ |
149 lines elided |
↑ open up ↑ |
320 320 #define ik_ref(x, y) 16*x+rofs+4*y(kptr) /* unreversed schedule: word y of the key 16*x bytes after (kptr + rofs) */
321 321 #endif /* AES_REV_DKS */
322 322
323 323 #define tab_0(x) (tptr,x,8) /* entry x of the table at tptr (8 bytes per entry), byte offset 0 */
324 324 #define tab_1(x) 3(tptr,x,8) /* same entry, starting at byte offset 3 */
325 325 #define tab_2(x) 2(tptr,x,8) /* same entry, starting at byte offset 2 */
326 326 #define tab_3(x) 1(tptr,x,8) /* same entry, starting at byte offset 1 */
327 327 #define tab_f(x) 1(tptr,x,8) /* byte 1 of a u8 entry is the unmultiplied value x */
328 328 #define tab_i(x) 7(tptr,x,8) /* byte 7 of a v8 entry is the unmultiplied value x */
329 329
330 - /* EXPORT DELETE START */
331 330 #define ff_rnd(p1, p2, p3, p4, round) /* normal forward round */ \
332 331 mov fk_ref(round,0), p1; /* p1..p4 start as the four words of this round's key */ \
333 332 mov fk_ref(round,1), p2; \
334 333 mov fk_ref(round,2), p3; \
335 334 mov fk_ref(round,3), p4; \
336 335 \
337 336 movzx %al, %esi; /* state word in eax: bytes 0 and 1 index the table (esi/edi are scratch) */ \
338 337 movzx %ah, %edi; \
339 338 shr $16, %eax; /* bring bytes 2 and 3 down into al/ah */ \
340 339 xor tab_0(%rsi), p1; \
341 340 xor tab_1(%rdi), p4; \
342 341 movzx %al, %esi; \
343 342 movzx %ah, %edi; \
344 343 xor tab_2(%rsi), p3; \
345 344 xor tab_3(%rdi), p2; \
346 345 \
347 346 movzx %bl, %esi; /* state word in ebx: same pattern, outputs rotated by one position */ \
348 347 movzx %bh, %edi; \
349 348 shr $16, %ebx; \
350 349 xor tab_0(%rsi), p2; \
351 350 xor tab_1(%rdi), p1; \
352 351 movzx %bl, %esi; \
353 352 movzx %bh, %edi; \
354 353 xor tab_2(%rsi), p4; \
355 354 xor tab_3(%rdi), p3; \
356 355 \
357 356 movzx %cl, %esi; /* state word in ecx */ \
358 357 movzx %ch, %edi; \
359 358 shr $16, %ecx; \
360 359 xor tab_0(%rsi), p3; \
361 360 xor tab_1(%rdi), p2; \
362 361 movzx %cl, %esi; \
363 362 movzx %ch, %edi; \
364 363 xor tab_2(%rsi), p1; \
365 364 xor tab_3(%rdi), p4; \
366 365 \
367 366 movzx %dl, %esi; /* state word in edx */ \
368 367 movzx %dh, %edi; \
369 368 shr $16, %edx; \
370 369 xor tab_0(%rsi), p4; \
371 370 xor tab_1(%rdi), p3; \
372 371 movzx %dl, %esi; \
373 372 movzx %dh, %edi; \
374 373 xor tab_2(%rsi), p2; \
375 374 xor tab_3(%rdi), p1; \
376 375 \
377 376 mov p1, %eax; /* copy the new state back into eax..edx for the next round */ \
378 377 mov p2, %ebx; \
379 378 mov p3, %ecx; \
380 379 mov p4, %edx
381 380
382 381 #ifdef LAST_ROUND_TABLES
383 382
384 383 #define fl_rnd(p1, p2, p3, p4, round) /* last forward round */ \
385 384 add $2048, tptr; /* step past the first 256*8-byte table to the w8 table that follows it */ \
386 385 mov fk_ref(round,0), p1; /* p1..p4 start as the four words of this round's key */ \
387 386 mov fk_ref(round,1), p2; \
388 387 mov fk_ref(round,2), p3; \
389 388 mov fk_ref(round,3), p4; \
390 389 \
391 390 movzx %al, %esi; /* same byte-to-output routing as ff_rnd */ \
392 391 movzx %ah, %edi; \
393 392 shr $16, %eax; \
394 393 xor tab_0(%rsi), p1; \
395 394 xor tab_1(%rdi), p4; \
396 395 movzx %al, %esi; \
397 396 movzx %ah, %edi; \
398 397 xor tab_2(%rsi), p3; \
399 398 xor tab_3(%rdi), p2; \
400 399 \
401 400 movzx %bl, %esi; \
402 401 movzx %bh, %edi; \
403 402 shr $16, %ebx; \
404 403 xor tab_0(%rsi), p2; \
405 404 xor tab_1(%rdi), p1; \
406 405 movzx %bl, %esi; \
407 406 movzx %bh, %edi; \
408 407 xor tab_2(%rsi), p4; \
409 408 xor tab_3(%rdi), p3; \
410 409 \
411 410 movzx %cl, %esi; \
412 411 movzx %ch, %edi; \
413 412 shr $16, %ecx; \
414 413 xor tab_0(%rsi), p3; \
415 414 xor tab_1(%rdi), p2; \
416 415 movzx %cl, %esi; \
417 416 movzx %ch, %edi; \
418 417 xor tab_2(%rsi), p1; \
419 418 xor tab_3(%rdi), p4; \
420 419 \
421 420 movzx %dl, %esi; \
422 421 movzx %dh, %edi; \
423 422 shr $16, %edx; \
424 423 xor tab_0(%rsi), p4; \
425 424 xor tab_1(%rdi), p3; \
426 425 movzx %dl, %esi; \
427 426 movzx %dh, %edi; \
428 427 xor tab_2(%rsi), p2; \
429 428 xor tab_3(%rdi), p1 /* unlike ff_rnd, the result is left in p1..p4 and not copied back to eax..edx */
430 429
431 430 #else
432 431
433 432 #define fl_rnd(p1, p2, p3, p4, round) /* last forward round */ \
434 433 mov fk_ref(round,0), p1; /* p1..p4 start as the four words of this round's key */ \
435 434 mov fk_ref(round,1), p2; \
436 435 mov fk_ref(round,2), p3; \
437 436 mov fk_ref(round,3), p4; \
438 437 \
439 438 movzx %al, %esi; \
440 439 movzx %ah, %edi; \
441 440 shr $16, %eax; \
442 441 movzx tab_f(%rsi), %esi; /* fetch the single unmultiplied byte from the table entry */ \
443 442 movzx tab_f(%rdi), %edi; \
444 443 xor %esi, p1; /* byte lands in bits 0..7 */ \
445 444 rol $8, %edi; /* move the byte to bits 8..15 */ \
446 445 xor %edi, p4; \
447 446 movzx %al, %esi; \
448 447 movzx %ah, %edi; \
449 448 movzx tab_f(%rsi), %esi; \
450 449 movzx tab_f(%rdi), %edi; \
451 450 rol $16, %esi; /* move the byte to bits 16..23 */ \
452 451 rol $24, %edi; /* move the byte to bits 24..31 */ \
453 452 xor %esi, p3; \
454 453 xor %edi, p2; \
455 454 \
456 455 movzx %bl, %esi; \
457 456 movzx %bh, %edi; \
458 457 shr $16, %ebx; \
459 458 movzx tab_f(%rsi), %esi; \
460 459 movzx tab_f(%rdi), %edi; \
461 460 xor %esi, p2; \
462 461 rol $8, %edi; \
463 462 xor %edi, p1; \
464 463 movzx %bl, %esi; \
465 464 movzx %bh, %edi; \
466 465 movzx tab_f(%rsi), %esi; \
467 466 movzx tab_f(%rdi), %edi; \
468 467 rol $16, %esi; \
469 468 rol $24, %edi; \
470 469 xor %esi, p4; \
471 470 xor %edi, p3; \
472 471 \
473 472 movzx %cl, %esi; \
474 473 movzx %ch, %edi; \
475 474 movzx tab_f(%rsi), %esi; \
476 475 movzx tab_f(%rdi), %edi; \
477 476 shr $16, %ecx; \
478 477 xor %esi, p3; \
479 478 rol $8, %edi; \
480 479 xor %edi, p2; \
481 480 movzx %cl, %esi; \
482 481 movzx %ch, %edi; \
483 482 movzx tab_f(%rsi), %esi; \
484 483 movzx tab_f(%rdi), %edi; \
485 484 rol $16, %esi; \
486 485 rol $24, %edi; \
487 486 xor %esi, p1; \
488 487 xor %edi, p4; \
489 488 \
490 489 movzx %dl, %esi; \
491 490 movzx %dh, %edi; \
492 491 movzx tab_f(%rsi), %esi; \
493 492 movzx tab_f(%rdi), %edi; \
494 493 shr $16, %edx; \
495 494 xor %esi, p4; \
496 495 rol $8, %edi; \
497 496 xor %edi, p3; \
498 497 movzx %dl, %esi; \
499 498 movzx %dh, %edi; \
500 499 movzx tab_f(%rsi), %esi; \
501 500 movzx tab_f(%rdi), %edi; \
502 501 rol $16, %esi; \
503 502 rol $24, %edi; \
504 503 xor %esi, p2; \
505 504 xor %edi, p1 /* result is left in p1..p4; nothing is copied back to eax..edx */
506 505
507 506 #endif /* LAST_ROUND_TABLES */
508 507
509 508 #define ii_rnd(p1, p2, p3, p4, round) /* normal inverse round */ \
510 509 mov ik_ref(round,0), p1; /* p1..p4 start as the four words of this round's key */ \
511 510 mov ik_ref(round,1), p2; \
512 511 mov ik_ref(round,2), p3; \
513 512 mov ik_ref(round,3), p4; \
514 513 \
515 514 movzx %al, %esi; /* state word in eax (esi/edi are scratch) */ \
516 515 movzx %ah, %edi; \
517 516 shr $16, %eax; /* bring bytes 2 and 3 down into al/ah */ \
518 517 xor tab_0(%rsi), p1; \
519 518 xor tab_1(%rdi), p2; /* byte 1 feeds p2 here, where ff_rnd feeds p4 */ \
520 519 movzx %al, %esi; \
521 520 movzx %ah, %edi; \
522 521 xor tab_2(%rsi), p3; \
523 522 xor tab_3(%rdi), p4; /* byte 3 feeds p4 here, where ff_rnd feeds p2 */ \
524 523 \
525 524 movzx %bl, %esi; /* state word in ebx */ \
526 525 movzx %bh, %edi; \
527 526 shr $16, %ebx; \
528 527 xor tab_0(%rsi), p2; \
529 528 xor tab_1(%rdi), p3; \
530 529 movzx %bl, %esi; \
531 530 movzx %bh, %edi; \
532 531 xor tab_2(%rsi), p4; \
533 532 xor tab_3(%rdi), p1; \
534 533 \
535 534 movzx %cl, %esi; /* state word in ecx */ \
536 535 movzx %ch, %edi; \
537 536 shr $16, %ecx; \
538 537 xor tab_0(%rsi), p3; \
539 538 xor tab_1(%rdi), p4; \
540 539 movzx %cl, %esi; \
541 540 movzx %ch, %edi; \
542 541 xor tab_2(%rsi), p1; \
543 542 xor tab_3(%rdi), p2; \
544 543 \
545 544 movzx %dl, %esi; /* state word in edx */ \
546 545 movzx %dh, %edi; \
547 546 shr $16, %edx; \
548 547 xor tab_0(%rsi), p4; \
549 548 xor tab_1(%rdi), p1; \
550 549 movzx %dl, %esi; \
551 550 movzx %dh, %edi; \
552 551 xor tab_2(%rsi), p2; \
553 552 xor tab_3(%rdi), p3; \
554 553 \
555 554 mov p1, %eax; /* copy the new state back into eax..edx for the next round */ \
556 555 mov p2, %ebx; \
557 556 mov p3, %ecx; \
558 557 mov p4, %edx
559 558
560 559 #ifdef LAST_ROUND_TABLES
561 560
562 561 #define il_rnd(p1, p2, p3, p4, round) /* last inverse round */ \
563 562 add $2048, tptr; /* step past the first 256*8-byte table to the w8 table that follows it */ \
564 563 mov ik_ref(round,0), p1; /* p1..p4 start as the four words of this round's key */ \
565 564 mov ik_ref(round,1), p2; \
566 565 mov ik_ref(round,2), p3; \
567 566 mov ik_ref(round,3), p4; \
568 567 \
569 568 movzx %al, %esi; /* same byte-to-output routing as ii_rnd */ \
570 569 movzx %ah, %edi; \
571 570 shr $16, %eax; \
572 571 xor tab_0(%rsi), p1; \
573 572 xor tab_1(%rdi), p2; \
574 573 movzx %al, %esi; \
575 574 movzx %ah, %edi; \
576 575 xor tab_2(%rsi), p3; \
577 576 xor tab_3(%rdi), p4; \
578 577 \
579 578 movzx %bl, %esi; \
580 579 movzx %bh, %edi; \
581 580 shr $16, %ebx; \
582 581 xor tab_0(%rsi), p2; \
583 582 xor tab_1(%rdi), p3; \
584 583 movzx %bl, %esi; \
585 584 movzx %bh, %edi; \
586 585 xor tab_2(%rsi), p4; \
587 586 xor tab_3(%rdi), p1; \
588 587 \
589 588 movzx %cl, %esi; \
590 589 movzx %ch, %edi; \
591 590 shr $16, %ecx; \
592 591 xor tab_0(%rsi), p3; \
593 592 xor tab_1(%rdi), p4; \
594 593 movzx %cl, %esi; \
595 594 movzx %ch, %edi; \
596 595 xor tab_2(%rsi), p1; \
597 596 xor tab_3(%rdi), p2; \
598 597 \
599 598 movzx %dl, %esi; \
600 599 movzx %dh, %edi; \
601 600 shr $16, %edx; \
602 601 xor tab_0(%rsi), p4; \
603 602 xor tab_1(%rdi), p1; \
604 603 movzx %dl, %esi; \
605 604 movzx %dh, %edi; \
606 605 xor tab_2(%rsi), p2; \
607 606 xor tab_3(%rdi), p3 /* unlike ii_rnd, the result is left in p1..p4 and not copied back to eax..edx */
608 607
609 608 #else
610 609
611 610 #define il_rnd(p1, p2, p3, p4, round) /* last inverse round */ \
612 611 mov ik_ref(round,0), p1; /* p1..p4 start as the four words of this round's key */ \
613 612 mov ik_ref(round,1), p2; \
614 613 mov ik_ref(round,2), p3; \
615 614 mov ik_ref(round,3), p4; \
616 615 \
617 616 movzx %al, %esi; \
618 617 movzx %ah, %edi; \
619 618 movzx tab_i(%rsi), %esi; /* fetch the single unmultiplied byte from the table entry */ \
620 619 movzx tab_i(%rdi), %edi; \
621 620 shr $16, %eax; \
622 621 xor %esi, p1; /* byte lands in bits 0..7 */ \
623 622 rol $8, %edi; /* move the byte to bits 8..15 */ \
624 623 xor %edi, p2; \
625 624 movzx %al, %esi; \
626 625 movzx %ah, %edi; \
627 626 movzx tab_i(%rsi), %esi; \
628 627 movzx tab_i(%rdi), %edi; \
629 628 rol $16, %esi; /* move the byte to bits 16..23 */ \
630 629 rol $24, %edi; /* move the byte to bits 24..31 */ \
631 630 xor %esi, p3; \
632 631 xor %edi, p4; \
633 632 \
634 633 movzx %bl, %esi; \
635 634 movzx %bh, %edi; \
636 635 movzx tab_i(%rsi), %esi; \
637 636 movzx tab_i(%rdi), %edi; \
638 637 shr $16, %ebx; \
639 638 xor %esi, p2; \
640 639 rol $8, %edi; \
641 640 xor %edi, p3; \
642 641 movzx %bl, %esi; \
643 642 movzx %bh, %edi; \
644 643 movzx tab_i(%rsi), %esi; \
645 644 movzx tab_i(%rdi), %edi; \
646 645 rol $16, %esi; \
647 646 rol $24, %edi; \
648 647 xor %esi, p4; \
649 648 xor %edi, p1; \
650 649 \
651 650 movzx %cl, %esi; \
652 651 movzx %ch, %edi; \
653 652 movzx tab_i(%rsi), %esi; \
654 653 movzx tab_i(%rdi), %edi; \
655 654 shr $16, %ecx; \
656 655 xor %esi, p3; \
657 656 rol $8, %edi; \
658 657 xor %edi, p4; \
659 658 movzx %cl, %esi; \
660 659 movzx %ch, %edi; \
661 660 movzx tab_i(%rsi), %esi; \
662 661 movzx tab_i(%rdi), %edi; \
663 662 rol $16, %esi; \
664 663 rol $24, %edi; \
665 664 xor %esi, p1; \
666 665 xor %edi, p2; \
667 666 \
668 667 movzx %dl, %esi; \
669 668 movzx %dh, %edi; \
670 669 movzx tab_i(%rsi), %esi; \
671 670 movzx tab_i(%rdi), %edi; \
672 671 shr $16, %edx; \
673 672 xor %esi, p4; \
674 673 rol $8, %edi; \
675 674 xor %edi, p1; \
↓ open down ↓ |
335 lines elided |
↑ open up ↑ |
676 675 movzx %dl, %esi; \
677 676 movzx %dh, %edi; \
678 677 movzx tab_i(%rsi), %esi; \
679 678 movzx tab_i(%rdi), %edi; \
680 679 rol $16, %esi; \
681 680 rol $24, %edi; \
682 681 xor %esi, p2; \
683 682 xor %edi, p3 /* result is left in p1..p4; nothing is copied back to eax..edx */
684 683
685 684 #endif /* LAST_ROUND_TABLES */
686 - /* EXPORT DELETE END */
687 685
688 686 /*
689 687 * OpenSolaris OS:
690 688 * void aes_encrypt_amd64(const aes_ks_t *ks, int Nr,
691 689 * const uint32_t pt[4], uint32_t ct[4])/
692 690 *
693 691 * Original interface:
694 692 * int aes_encrypt(const unsigned char *in,
695 693 * unsigned char *out, const aes_encrypt_ctx cx[1])/
696 694 */
697 695 .align 64
698 696 enc_tab: /* 256 entries of 8 bytes (u8 expansion) = 2048 bytes */
699 697 enc_vals(u8)
700 698 #ifdef LAST_ROUND_TABLES
701 699 / Last Round Tables:
702 700 enc_vals(w8) /* second 2048-byte table; fl_rnd reaches it by adding 2048 to tptr */
703 701 #endif
704 702
705 703
706 704 ENTRY_NP(aes_encrypt_amd64) /* saves rbx, rbp and r12 in a 32-byte stack frame and restores them at label 4 */
707 - /* EXPORT DELETE START */
708 705 #ifdef GLADMAN_INTERFACE
709 706 / Original interface
710 707 sub $[4*8], %rsp / gnu/linux/opensolaris binary interface
711 708 mov %rsi, (%rsp) / output pointer (P2)
712 709 mov %rdx, %r8 / context (P3)
713 710
714 711 mov %rbx, 1*8(%rsp) / P1: input pointer in rdi
715 712 mov %rbp, 2*8(%rsp) / P2: output pointer in (rsp)
716 713 mov %r12, 3*8(%rsp) / P3: context in r8
717 714 movzx 4*KS_LENGTH(kptr), %esi / Get byte key length * 16
718 715
719 716 #else
720 717 / OpenSolaris OS interface
721 718 sub $[4*8], %rsp / Make room on stack to save registers
722 719 mov %rcx, (%rsp) / Save output pointer (P4) on stack
723 720 mov %rdi, %r8 / context (P1)
724 721 mov %rdx, %rdi / P3: save input pointer
725 722 shl $4, %esi / P2: esi byte key length * 16
726 723
727 724 mov %rbx, 1*8(%rsp) / Save registers
728 725 mov %rbp, 2*8(%rsp)
729 726 mov %r12, 3*8(%rsp)
730 727 / P1: context in r8
731 728 / P2: byte key length * 16 in esi
732 729 / P3: input pointer in rdi
733 730 / P4: output pointer in (rsp)
734 731 #endif /* GLADMAN_INTERFACE */
735 732
736 733 lea enc_tab(%rip), tptr /* tptr (%rbp) = base of the forward tables */
737 734 sub $fofs, kptr /* bias kptr by fofs; fk_ref() adds fofs back in its displacement */
738 735
739 736 / Load input block into registers
740 737 mov (%rdi), %eax
741 738 mov 1*4(%rdi), %ebx
742 739 mov 2*4(%rdi), %ecx
743 740 mov 3*4(%rdi), %edx
744 741
745 742 xor fofs(kptr), %eax /* xor the first 16 bytes of the key schedule into the input block */
746 743 xor fofs+4(kptr), %ebx
747 744 xor fofs+8(kptr), %ecx
748 745 xor fofs+12(kptr), %edx
749 746
750 747 lea (kptr,%rsi), kptr /* advance kptr by rsi (16 * rounds) so fk_ref(0, ..) addresses the final round key */
751 748 / Jump based on byte key length * 16:
752 749 cmp $[10*16], %esi
753 750 je 3f
754 751 cmp $[12*16], %esi
755 752 je 2f
756 753 cmp $[14*16], %esi
757 754 je 1f
758 755 mov $-1, %rax / error
759 756 jmp 4f /* any other round count: skip the rounds and the store, so the output block is not written */
760 757
761 758 / Perform normal forward rounds
762 759 1: ff_rnd(%r9d, %r10d, %r11d, %r12d, 13) /* entry point for 14 rounds */
763 760 ff_rnd(%r9d, %r10d, %r11d, %r12d, 12)
764 761 2: ff_rnd(%r9d, %r10d, %r11d, %r12d, 11) /* entry point for 12 rounds */
765 762 ff_rnd(%r9d, %r10d, %r11d, %r12d, 10)
766 763 3: ff_rnd(%r9d, %r10d, %r11d, %r12d, 9) /* entry point for 10 rounds */
767 764 ff_rnd(%r9d, %r10d, %r11d, %r12d, 8)
768 765 ff_rnd(%r9d, %r10d, %r11d, %r12d, 7)
769 766 ff_rnd(%r9d, %r10d, %r11d, %r12d, 6)
770 767 ff_rnd(%r9d, %r10d, %r11d, %r12d, 5)
771 768 ff_rnd(%r9d, %r10d, %r11d, %r12d, 4)
772 769 ff_rnd(%r9d, %r10d, %r11d, %r12d, 3)
773 770 ff_rnd(%r9d, %r10d, %r11d, %r12d, 2)
774 771 ff_rnd(%r9d, %r10d, %r11d, %r12d, 1)
775 772 fl_rnd(%r9d, %r10d, %r11d, %r12d, 0) /* last round leaves the result in r9d..r12d */
776 773
777 774 / Copy results
778 775 mov (%rsp), %rbx /* reload the output pointer saved at entry */
↓ open down ↓ |
61 lines elided |
↑ open up ↑ |
779 776 mov %r9d, (%rbx)
780 777 mov %r10d, 4(%rbx)
781 778 mov %r11d, 8(%rbx)
782 779 mov %r12d, 12(%rbx)
783 780 xor %rax, %rax /* rax = 0 on the normal path; the error path above leaves -1 */
784 781 4: / Restore registers
785 782 mov 1*8(%rsp), %rbx
786 783 mov 2*8(%rsp), %rbp
787 784 mov 3*8(%rsp), %r12
788 785 add $[4*8], %rsp
789 - /* EXPORT DELETE END */
790 786 ret
791 787
792 788 SET_SIZE(aes_encrypt_amd64)
793 789
794 790 /*
795 791 * OpenSolaris OS:
796 792 * void aes_decrypt_amd64(const aes_ks_t *ks, int Nr,
797 793 * const uint32_t ct[4], uint32_t pt[4])/
798 794 *
799 795 * Original interface:
800 796 * int aes_decrypt(const unsigned char *in,
801 797 * unsigned char *out, const aes_encrypt_ctx cx[1])/
802 798 */
↓ open down ↓ |
3 lines elided |
↑ open up ↑ |
803 799 .align 64
804 800 dec_tab: /* 256 entries of 8 bytes (v8 expansion) = 2048 bytes */
805 801 dec_vals(v8)
806 802 #ifdef LAST_ROUND_TABLES
807 803 / Last Round Tables:
808 804 dec_vals(w8) /* second 2048-byte table; il_rnd reaches it by adding 2048 to tptr */
809 805 #endif
810 806
811 807
812 808 ENTRY_NP(aes_decrypt_amd64) /* saves rbx, rbp and r12 in a 32-byte stack frame and restores them at label 4 */
813 - /* EXPORT DELETE START */
814 809 #ifdef GLADMAN_INTERFACE
815 810 / Original interface
816 811 sub $[4*8], %rsp / gnu/linux/opensolaris binary interface
817 812 mov %rsi, (%rsp) / output pointer (P2)
818 813 mov %rdx, %r8 / context (P3)
819 814
820 815 mov %rbx, 1*8(%rsp) / P1: input pointer in rdi
821 816 mov %rbp, 2*8(%rsp) / P2: output pointer in (rsp)
822 817 mov %r12, 3*8(%rsp) / P3: context in r8
823 818 movzx 4*KS_LENGTH(kptr), %esi / Get byte key length * 16
824 819
825 820 #else
826 821 / OpenSolaris OS interface
827 822 sub $[4*8], %rsp / Make room on stack to save registers
828 823 mov %rcx, (%rsp) / Save output pointer (P4) on stack
829 824 mov %rdi, %r8 / context (P1)
830 825 mov %rdx, %rdi / P3: save input pointer
831 826 shl $4, %esi / P2: esi byte key length * 16
832 827
833 828 mov %rbx, 1*8(%rsp) / Save registers
834 829 mov %rbp, 2*8(%rsp)
835 830 mov %r12, 3*8(%rsp)
836 831 / P1: context in r8
837 832 / P2: byte key length * 16 in esi
838 833 / P3: input pointer in rdi
839 834 / P4: output pointer in (rsp)
840 835 #endif /* GLADMAN_INTERFACE */
841 836
842 837 lea dec_tab(%rip), tptr /* tptr (%rbp) = base of the inverse tables */
843 838 sub $rofs, kptr /* bias kptr by rofs; ik_ref() adds rofs back in its displacement */
844 839
845 840 / Load input block into registers
846 841 mov (%rdi), %eax
847 842 mov 1*4(%rdi), %ebx
848 843 mov 2*4(%rdi), %ecx
849 844 mov 3*4(%rdi), %edx
850 845
851 846 #ifdef AES_REV_DKS
852 847 mov kptr, %rdi /* reversed schedule: the first key to apply is at the start */
853 848 lea (kptr,%rsi), kptr /* kptr moves up by rsi (16 * rounds) so ik_ref() counts back from the end */
854 849 #else
855 850 lea (kptr,%rsi), %rdi /* unreversed schedule: the first key to apply is rsi bytes in; kptr stays at the start */
856 851 #endif
857 852
858 853 xor rofs(%rdi), %eax /* xor the 16-byte key at rdi into the input block */
859 854 xor rofs+4(%rdi), %ebx
860 855 xor rofs+8(%rdi), %ecx
861 856 xor rofs+12(%rdi), %edx
862 857
863 858 / Jump based on byte key length * 16:
864 859 cmp $[10*16], %esi
865 860 je 3f
866 861 cmp $[12*16], %esi
867 862 je 2f
868 863 cmp $[14*16], %esi
869 864 je 1f
870 865 mov $-1, %rax / error
871 866 jmp 4f /* any other round count: skip the rounds and the store, so the output block is not written */
872 867
873 868 / Perform normal inverse rounds
874 869 1: ii_rnd(%r9d, %r10d, %r11d, %r12d, 13) /* entry point for 14 rounds */
875 870 ii_rnd(%r9d, %r10d, %r11d, %r12d, 12)
876 871 2: ii_rnd(%r9d, %r10d, %r11d, %r12d, 11) /* entry point for 12 rounds */
877 872 ii_rnd(%r9d, %r10d, %r11d, %r12d, 10)
878 873 3: ii_rnd(%r9d, %r10d, %r11d, %r12d, 9) /* entry point for 10 rounds */
879 874 ii_rnd(%r9d, %r10d, %r11d, %r12d, 8)
880 875 ii_rnd(%r9d, %r10d, %r11d, %r12d, 7)
881 876 ii_rnd(%r9d, %r10d, %r11d, %r12d, 6)
882 877 ii_rnd(%r9d, %r10d, %r11d, %r12d, 5)
883 878 ii_rnd(%r9d, %r10d, %r11d, %r12d, 4)
884 879 ii_rnd(%r9d, %r10d, %r11d, %r12d, 3)
885 880 ii_rnd(%r9d, %r10d, %r11d, %r12d, 2)
886 881 ii_rnd(%r9d, %r10d, %r11d, %r12d, 1)
887 882 il_rnd(%r9d, %r10d, %r11d, %r12d, 0) /* last round leaves the result in r9d..r12d */
888 883
889 884 / Copy results
890 885 mov (%rsp), %rbx /* reload the output pointer saved at entry */
↓ open down ↓ |
67 lines elided |
↑ open up ↑ |
891 886 mov %r9d, (%rbx)
892 887 mov %r10d, 4(%rbx)
893 888 mov %r11d, 8(%rbx)
894 889 mov %r12d, 12(%rbx)
895 890 xor %rax, %rax /* rax = 0 on the normal path; the error path above leaves -1 */
896 891 4: / Restore registers
897 892 mov 1*8(%rsp), %rbx
898 893 mov 2*8(%rsp), %rbp
899 894 mov 3*8(%rsp), %r12
900 895 add $[4*8], %rsp
901 - /* EXPORT DELETE END */
902 896 ret
903 897
904 898 SET_SIZE(aes_decrypt_amd64)
905 899 #endif /* lint || __lint */
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX