13 #include <botan/aes_ssse3.h>
14 #include <tmmintrin.h>
// Constant vectors shared by the key-schedule and cipher routines in this file.

// 0x0F in every byte: mask that isolates the low 4 bits of each byte
// (and, via _mm_andnot_si128, the high 4 bits).
20 const __m128i low_nibs = _mm_set1_epi8(0x0F);
// k_ipt1 / k_ipt2: 16-entry byte tables applied with _mm_shuffle_epi8 to the
// input block in aes_ssse3_encrypt and, through aes_schedule_transform, to the
// raw key in the key_schedule routines. k_ipt1 is indexed by the low nibbles.
22 const __m128i k_ipt1 = _mm_set_epi32(
23 0xCABAE090, 0x52227808, 0xC2B2E898, 0x5A2A7000);
24 const __m128i k_ipt2 = _mm_set_epi32(
25 0xCD80B1FC, 0xB0FDCC81, 0x4C01307D, 0x317C4D00);
// k_inv1 / k_inv2: shuffle tables applied to the split nibbles in
// aes_schedule_round, aes_ssse3_encrypt and aes_ssse3_decrypt.
27 const __m128i k_inv1 = _mm_set_epi32(
28 0x04070309, 0x0A0B0C02, 0x0E05060F, 0x0D080180);
29 const __m128i k_inv2 = _mm_set_epi32(
30 0x030D0E0C, 0x02050809, 0x01040A06, 0x0F0B0780);
// sb1u / sb1t: shuffle tables indexed by the t5 / t6 intermediates in
// aes_schedule_round and aes_ssse3_encrypt.
32 const __m128i sb1u = _mm_set_epi32(
33 0xA5DF7A6E, 0x142AF544, 0xB19BE18F, 0xCB503E00);
34 const __m128i sb1t = _mm_set_epi32(
35 0x3BF7CCC1, 0x0D2ED9EF, 0x3618D415, 0xFAE22300);
// Four byte-shuffle control vectors for _mm_shuffle_epi8.
// Entry 0 rotates the four bytes inside every 32-bit lane by one position
// (output byte 0 takes input byte 1, ..., output byte 3 takes input byte 0).
// Entries 1-3 apply the same per-lane pattern with the source lanes rotated.
// aes_ssse3_encrypt picks the entry with r % 4, the key-schedule mangle
// helpers always use entry 0, and aes_ssse3_decrypt starts from entry 3.
// NOTE(review): the closing brace of this initializer is not visible in this listing.
37 const __m128i mc_forward[4] = {
38 _mm_set_epi32(0x0C0F0E0D, 0x080B0A09, 0x04070605, 0x00030201),
39 _mm_set_epi32(0x00030201, 0x0C0F0E0D, 0x080B0A09, 0x04070605),
40 _mm_set_epi32(0x04070605, 0x00030201, 0x0C0F0E0D, 0x080B0A09),
41 _mm_set_epi32(0x080B0A09, 0x04070605, 0x00030201, 0x0C0F0E0D)
// Four byte-shuffle control vectors for _mm_shuffle_epi8.
// For entry k, output byte i takes source byte (i * (4*k + 1)) mod 16, so
// sr[0] is the identity order and sr[1..3] step through the source with
// strides 5, 9 and 13. Callers select the entry with round_no % 4
// (or which_sr in aes_ssse3_decrypt).
// NOTE(review): the closing brace of this initializer is not visible in this listing.
44 const __m128i sr[4] = {
45 _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100),
46 _mm_set_epi32(0x0B06010C, 0x07020D08, 0x030E0904, 0x0F0A0500),
47 _mm_set_epi32(0x070E050C, 0x030A0108, 0x0F060D04, 0x0B020900),
48 _mm_set_epi32(0x0306090C, 0x0F020508, 0x0B0E0104, 0x070A0D00),
// Three-way XOR of 128-bit vectors: x ^ (y ^ z), built from two _mm_xor_si128 calls.
// Being a macro, each argument expression is expanded exactly once.
51 #define mm_xor3(x, y, z) _mm_xor_si128(x, _mm_xor_si128(y, z))
// Applies a pair of 16-entry byte tables to every byte of `input`: one table
// is indexed by the byte's low nibble, the other by its high nibble.
// NOTE(review): the remaining parameters (table_1, table_2 as used below) and
// the call that combines the two lookups are not visible in this listing.
53 __m128i aes_schedule_transform(__m128i input,
// i_1: low 4 bits of every byte.
57 __m128i i_1 = _mm_and_si128(low_nibs, input);
// i_2: high 4 bits of every byte moved down to bits 0-3. The low nibbles are
// cleared before the 32-bit shift, so nothing leaks across byte boundaries.
58 __m128i i_2 = _mm_srli_epi32(_mm_andnot_si128(low_nibs, input), 4);
// Overwrites `input` with its low nibbles; no visible line reads it afterwards.
60 input = _mm_and_si128(low_nibs, input);
// Table lookups: _mm_shuffle_epi8 uses each nibble value as an index into the table.
63 _mm_shuffle_epi8(table_1, i_1),
64 _mm_shuffle_epi8(table_2, i_2));
// Converts a key-schedule state `k` into the stored form of an encryption
// round key; `round_no` selects (mod 4) the final byte permutation from sr[].
// NOTE(review): the declaration of t2 and the second argument of the first
// shuffle are not visible in this listing.
67 __m128i aes_schedule_mangle(__m128i k,
byte round_no)
// XOR every byte of k with 0x5B, then byte-shuffle the result.
69 __m128i t = _mm_shuffle_epi8(_mm_xor_si128(k, _mm_set1_epi8(0x5B)),
// Rotate the bytes inside each 32-bit lane by one more position.
74 t = _mm_shuffle_epi8(t, mc_forward[0]);
// Fold the current t and a further-rotated copy of t into t2.
76 t2 =
mm_xor3(t2, t, _mm_shuffle_epi8(t, mc_forward[0]));
// Final byte permutation chosen by round_no modulo 4.
78 return _mm_shuffle_epi8(t2, sr[round_no % 4]);
// Helper used by AES_192_SSSE3::key_schedule to mix two schedule vectors.
// NOTE(review): only the two dword shuffles are visible here; the operation
// that combines them with each other (and possibly with x or y) is not shown.
81 __m128i aes_schedule_192_smear(__m128i x, __m128i y)
// 0xFE: result dwords are (x[2], x[3], x[3], x[3]), lowest first.
84 _mm_shuffle_epi32(x, 0xFE),
// 0x80: result dwords are (y[0], y[0], y[0], y[2]), lowest first.
85 _mm_shuffle_epi32(y, 0x80));
// Converts a key-schedule state `k` into the stored form of a decryption round
// key. Four table-transform stages are chained; after each stage the running
// output is XORed with the stage result and its 32-bit lanes are byte-rotated.
// `round_no` selects (mod 4) the final byte permutation from sr[].
88 __m128i aes_schedule_mangle_dec(__m128i k,
byte round_no)
// Eight lookup tables, consumed in pairs (dsk[0],dsk[1]) ... (dsk[6],dsk[7]).
// NOTE(review): the closing brace of this initializer is not visible in this listing.
90 const __m128i dsk[8] = {
91 _mm_set_epi32(0x4AED9334, 0x82255BFC, 0xB6116FC8, 0x7ED9A700),
92 _mm_set_epi32(0x8BB89FAC, 0xE9DAFDCE, 0x45765162, 0x27143300),
93 _mm_set_epi32(0x4622EE8A, 0xADC90561, 0x27438FEB, 0xCCA86400),
94 _mm_set_epi32(0x73AEE13C, 0xBD602FF2, 0x815C13CE, 0x4F92DD00),
95 _mm_set_epi32(0xF83F3EF9, 0xFA3D3CFB, 0x03C4C502, 0x01C6C700),
96 _mm_set_epi32(0xA5526A9D, 0x7384BC4B, 0xEE1921D6, 0x38CFF700),
97 _mm_set_epi32(0xA080D3F3, 0x10306343, 0xE3C390B0, 0x53732000),
98 _mm_set_epi32(0x2F45AEC4, 0x8CE60D67, 0xA0CA214B, 0x036982E8)
// Stage 1 transforms k itself; the output starts as the rotated stage result.
101 __m128i t = aes_schedule_transform(k, dsk[0], dsk[1]);
102 __m128i output = _mm_shuffle_epi8(t, mc_forward[0]);
// Stages 2-4 each transform the previous stage's t (not k) and accumulate.
104 t = aes_schedule_transform(t, dsk[2], dsk[3]);
105 output = _mm_shuffle_epi8(_mm_xor_si128(t, output), mc_forward[0]);
107 t = aes_schedule_transform(t, dsk[4], dsk[5]);
108 output = _mm_shuffle_epi8(_mm_xor_si128(t, output), mc_forward[0]);
110 t = aes_schedule_transform(t, dsk[6], dsk[7]);
111 output = _mm_shuffle_epi8(_mm_xor_si128(t, output), mc_forward[0]);
// Final byte permutation chosen by round_no modulo 4.
113 return _mm_shuffle_epi8(output, sr[round_no % 4]);
// Produces the stored form of the final encryption round key: permute the
// bytes of k with sr[round_no % 4], XOR every byte with 0x5B, then apply the
// out_tr1/out_tr2 table pair through aes_schedule_transform.
116 __m128i aes_schedule_mangle_last(__m128i k,
byte round_no)
// Table pair used only by this function.
118 const __m128i out_tr1 = _mm_set_epi32(
119 0xF7974121, 0xDEBE6808, 0xFF9F4929, 0xD6B66000);
120 const __m128i out_tr2 = _mm_set_epi32(
121 0xE10D5DB1, 0xB05C0CE0, 0x01EDBD51, 0x50BCEC00);
123 k = _mm_shuffle_epi8(k, sr[round_no % 4]);
124 k = _mm_xor_si128(k, _mm_set1_epi8(0x5B));
125 return aes_schedule_transform(k, out_tr1, out_tr2);
// Produces the stored form of the key written to DK[0] by the key_schedule
// routines: XOR every byte of k with 0x5B, then apply the deskew1/deskew2
// table pair through aes_schedule_transform. Unlike aes_schedule_mangle_last,
// no sr[] permutation is applied.
128 __m128i aes_schedule_mangle_last_dec(__m128i k)
// Table pair used only by this function.
130 const __m128i deskew1 = _mm_set_epi32(
131 0x1DFEB95A, 0x5DBEF91A, 0x07E4A340, 0x47A4E300);
132 const __m128i deskew2 = _mm_set_epi32(
133 0x2841C2AB, 0xF49D1E77, 0x5F36B5DC, 0x83EA6900);
135 k = _mm_xor_si128(k, _mm_set1_epi8(0x5B));
136 return aes_schedule_transform(k, deskew1, deskew2);
// One step of the key schedule. `rcon` points to a rotating round-constant
// vector that is consumed and advanced; input1 and input2 are the two schedule
// vectors being combined.
// NOTE(review): AES_256_SSSE3::key_schedule passes NULL for rcon, so the two
// *rcon statements below are presumably guarded by a null check that is not
// visible in this listing, as is the third operand of the final mm_xor3.
139 __m128i aes_schedule_round(__m128i* rcon, __m128i input1, __m128i input2)
// Move byte 15 of *rcon into byte 0 of an otherwise zero vector and XOR it in.
143 input2 = _mm_xor_si128(_mm_alignr_epi8(_mm_setzero_si128(), *rcon, 15),
// Rotate *rcon by 15 bytes so the next call sees the next constant byte.
146 *rcon = _mm_alignr_epi8(*rcon, *rcon, 15);
// Broadcast the highest dword of input1 to all four lanes, then rotate the
// whole vector by one byte.
148 input1 = _mm_shuffle_epi32(input1, 0xFF);
149 input1 = _mm_alignr_epi8(input1, input1, 1);
// "Smear" input2: XOR it with copies of itself shifted left by 4 and then
// 8 bytes, and XOR 0x5B into every byte.
152 __m128i smeared = _mm_xor_si128(input2, _mm_slli_si128(input2, 4));
153 smeared =
mm_xor3(smeared, _mm_slli_si128(smeared, 8), _mm_set1_epi8(0x5B));
// Split input1 into high nibbles (t) and low nibbles (input1).
155 __m128i t = _mm_srli_epi32(_mm_andnot_si128(low_nibs, input1), 4);
157 input1 = _mm_and_si128(low_nibs, input1);
// Nibble-wise table lookups through k_inv1 / k_inv2; the same sequence
// appears in aes_ssse3_encrypt and aes_ssse3_decrypt.
159 __m128i t2 = _mm_shuffle_epi8(k_inv2, input1);
161 input1 = _mm_xor_si128(input1, t);
163 __m128i t3 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, t));
164 __m128i t4 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, input1));
166 __m128i t5 = _mm_xor_si128(input1, _mm_shuffle_epi8(k_inv1, t3));
167 __m128i t6 = _mm_xor_si128(t, _mm_shuffle_epi8(k_inv1, t4));
// Output lookups through sb1u / sb1t, XORed with a third operand
// (not visible; presumably `smeared` — confirm).
169 return mm_xor3(_mm_shuffle_epi8(sb1u, t5),
170 _mm_shuffle_epi8(sb1t, t6),
// Encrypts one 16-byte block B with the round keys at `keys` (read with
// unaligned loads, one __m128i per round) and returns the result.
// `rounds` is 10, 12 or 14 at the call sites in this file.
// NOTE(review): braces, the loop-exit test and several operands are missing
// from this listing; comments below describe only what the visible lines do.
174 __m128i aes_ssse3_encrypt(__m128i B,
const __m128i* keys,
size_t rounds)
// Shuffle tables local to encryption, indexed by t5 (..u) and t6 (..t).
176 const __m128i sb2u = _mm_set_epi32(
177 0x5EB7E955, 0xBC982FCD, 0xE27A93C6, 0x0B712400);
178 const __m128i sb2t = _mm_set_epi32(
179 0xC2A163C8, 0xAB82234A, 0x69EB8840, 0x0AE12900);
181 const __m128i sbou = _mm_set_epi32(
182 0x15AABF7A, 0xC502A878, 0xD0D26D17, 0x6FBDC700);
183 const __m128i sbot = _mm_set_epi32(
184 0x8E1E90D1, 0x412B35FA, 0xCFE474A5, 0x5FBB6A00);
// Byte-shuffle controls that rotate each 32-bit lane's bytes the opposite way
// to mc_forward (entry 0: output byte 0 takes input byte 3).
// NOTE(review): the closing brace of this initializer is not visible in this listing.
186 const __m128i mc_backward[4] = {
187 _mm_set_epi32(0x0E0D0C0F, 0x0A09080B, 0x06050407, 0x02010003),
188 _mm_set_epi32(0x0A09080B, 0x06050407, 0x02010003, 0x0E0D0C0F),
189 _mm_set_epi32(0x06050407, 0x02010003, 0x0E0D0C0F, 0x0A09080B),
190 _mm_set_epi32(0x02010003, 0x0E0D0C0F, 0x0A09080B, 0x06050407),
// Input transform: look up the low nibbles in k_ipt1 and the high-nibble part
// in k_ipt2, and XOR both with round key 0.
193 B =
mm_xor3(_mm_shuffle_epi8(k_ipt1, _mm_and_si128(low_nibs, B)),
194 _mm_shuffle_epi8(k_ipt2,
196 _mm_andnot_si128(low_nibs, B),
198 _mm_loadu_si128(keys));
// Round loop: no condition in the header, so the exit must come from inside
// the body (presumably a return once r reaches `rounds` — not visible).
200 for(
size_t r = 1; ; ++
r)
// Round key for this iteration.
202 const __m128i K = _mm_loadu_si128(keys +
r);
// Split the state into high nibbles (t) and low nibbles (B).
204 __m128i t = _mm_srli_epi32(_mm_andnot_si128(low_nibs, B), 4);
206 B = _mm_and_si128(low_nibs, B);
// Nibble-wise lookups through k_inv1 / k_inv2 producing the indices t5, t6.
208 __m128i t2 = _mm_shuffle_epi8(k_inv2, B);
210 B = _mm_xor_si128(B, t);
212 __m128i t3 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, t));
213 __m128i t4 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, B));
215 __m128i t5 = _mm_xor_si128(B, _mm_shuffle_epi8(k_inv1, t3));
216 __m128i t6 = _mm_xor_si128(t, _mm_shuffle_epi8(k_inv1, t4));
// Output path using sbou / sbot followed by a byte shuffle; presumably taken
// only on the last round — the guarding condition is not visible.
220 B = _mm_shuffle_epi8(
221 mm_xor3(_mm_shuffle_epi8(sbou, t5),
222 _mm_shuffle_epi8(sbot, t6),
// Regular round: t7 combines the sb1 lookups (third operand not visible).
229 __m128i t7 =
mm_xor3(_mm_shuffle_epi8(sb1t, t6),
230 _mm_shuffle_epi8(sb1u, t5),
// t8 combines the sb2 lookups with t7 rotated by mc_forward[r % 4].
233 __m128i t8 =
mm_xor3(_mm_shuffle_epi8(sb2t, t6),
234 _mm_shuffle_epi8(sb2u, t5),
235 _mm_shuffle_epi8(t7, mc_forward[
r % 4]));
// New state: rotated t8, t7 rotated the other way, and a third operand
// that is not visible in this listing.
237 B =
mm_xor3(_mm_shuffle_epi8(t8, mc_forward[
r % 4]),
238 _mm_shuffle_epi8(t7, mc_backward[r % 4]),
// Decrypts one 16-byte block B with the round keys at `keys` (read with
// unaligned loads, one __m128i per round) and returns the result.
// `rounds` is 10, 12 or 14 at the call sites in this file.
// NOTE(review): braces, the loop-exit test and the declaration of `t` are
// missing from this listing; comments below describe only the visible lines.
243 __m128i aes_ssse3_decrypt(__m128i B,
const __m128i* keys,
size_t rounds)
// Input-transform tables for decryption.
245 const __m128i k_dipt1 = _mm_set_epi32(
246 0x154A411E, 0x114E451A, 0x0F505B04, 0x0B545F00);
247 const __m128i k_dipt2 = _mm_set_epi32(
248 0x12771772, 0xF491F194, 0x86E383E6, 0x60056500);
// Four table pairs used in the regular rounds, indexed by t5 (..u) and t6 (..t).
250 const __m128i sb9u = _mm_set_epi32(
251 0xCAD51F50, 0x4F994CC9, 0x851C0353, 0x9A86D600);
252 const __m128i sb9t = _mm_set_epi32(
253 0x725E2C9E, 0xB2FBA565, 0xC03B1789, 0xECD74900);
255 const __m128i sbeu = _mm_set_epi32(
256 0x22426004, 0x64B4F6B0, 0x46F29296, 0x26D4D000);
257 const __m128i sbet = _mm_set_epi32(
258 0x9467F36B, 0x98593E32, 0x0C55A6CD, 0xFFAAC100);
260 const __m128i sbdu = _mm_set_epi32(
261 0xF56E9B13, 0x882A4439, 0x7D57CCDF, 0xE6B1A200);
262 const __m128i sbdt = _mm_set_epi32(
263 0x2931180D, 0x15DEEFD3, 0x3CE2FAF7, 0x24C6CB00);
265 const __m128i sbbu = _mm_set_epi32(
266 0x602646F6, 0xB0F2D404, 0xD0226492, 0x96B44200);
267 const __m128i sbbt = _mm_set_epi32(
268 0xF3FF0C3E, 0x3255AA6B, 0xC19498A6, 0xCD596700);
// Byte-shuffle control, rotated by 12 bytes at the end of every regular round.
270 __m128i mc = mc_forward[3];
// Input transform: the high-nibble part is looked up in k_dipt2 (into t) ...
273 _mm_shuffle_epi8(k_dipt2,
275 _mm_andnot_si128(low_nibs, B),
// ... and combined with round key 0 and the k_dipt1 lookup of the low nibbles.
278 B =
mm_xor3(t, _mm_loadu_si128(keys),
279 _mm_shuffle_epi8(k_dipt1, _mm_and_si128(B, low_nibs)));
// Round loop: no condition in the header, so the exit must come from inside
// the body (the `return` further down).
281 for(
size_t r = 1; ; ++
r)
// Round key for this iteration.
283 const __m128i K = _mm_loadu_si128(keys +
r);
// Split the state into high nibbles (t) and low nibbles (B).
285 t = _mm_srli_epi32(_mm_andnot_si128(low_nibs, B), 4);
287 B = _mm_and_si128(low_nibs, B);
// Nibble-wise lookups through k_inv1 / k_inv2 producing the indices t5, t6.
289 __m128i t2 = _mm_shuffle_epi8(k_inv2, B);
291 B = _mm_xor_si128(B, t);
293 __m128i t3 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, t));
294 __m128i t4 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, B));
295 __m128i t5 = _mm_xor_si128(B, _mm_shuffle_epi8(k_inv1, t3));
296 __m128i t6 = _mm_xor_si128(t, _mm_shuffle_epi8(k_inv1, t4));
// Output path (presumably entered only when r == rounds — the condition is
// not visible). These sbou / sbot values differ from the tables of the same
// name in aes_ssse3_encrypt.
300 const __m128i sbou = _mm_set_epi32(
301 0xC7AA6DB9, 0xD4943E2D, 0x1387EA53, 0x7EF94000);
302 const __m128i sbot = _mm_set_epi32(
303 0xCA4B8159, 0xD8C58E9C, 0x12D7560F, 0x93441D00);
305 __m128i x = _mm_shuffle_epi8(sbou, t5);
306 __m128i y = _mm_shuffle_epi8(sbot, t6);
307 x = _mm_xor_si128(x, K);
308 x = _mm_xor_si128(x, y);
// Pick the final byte permutation from the round count (value in 0..3).
310 const u32bit which_sr = ((((rounds - 1) << 4) ^ 48) & 48) / 16;
311 return _mm_shuffle_epi8(x, sr[which_sr]);
// Regular round: chain the sb9, sbd, sbb and sbe table pairs, shuffling the
// running value by `mc` between stages; the round key K enters in the first stage.
314 __m128i t8 = _mm_xor_si128(_mm_shuffle_epi8(sb9t, t6),
315 _mm_xor_si128(_mm_shuffle_epi8(sb9u, t5), K));
317 __m128i t9 =
mm_xor3(_mm_shuffle_epi8(t8, mc),
318 _mm_shuffle_epi8(sbdu, t5),
319 _mm_shuffle_epi8(sbdt, t6));
321 __m128i t12 = _mm_xor_si128(
323 _mm_shuffle_epi8(t9, mc),
324 _mm_shuffle_epi8(sbbu, t5)),
325 _mm_shuffle_epi8(sbbt, t6));
327 B = _mm_xor_si128(_mm_xor_si128(_mm_shuffle_epi8(t12, mc),
328 _mm_shuffle_epi8(sbeu, t5)),
329 _mm_shuffle_epi8(sbet, t6));
// Rotate the shuffle control by 12 bytes for the next round.
331 mc = _mm_alignr_epi8(mc, mc, 12);
// Block-encryption loop with a fixed round count of 10.
// NOTE(review): the enclosing function header is not visible at this position;
// presumably this is the body of AES_128_SSSE3::encrypt_n(in, out, blocks) — confirm.
// View the byte buffers and the EK key storage as arrays of 128-bit vectors.
342 const __m128i* in_mm =
reinterpret_cast<const __m128i*
>(in);
343 __m128i* out_mm =
reinterpret_cast<__m128i*
>(out);
345 const __m128i* keys =
reinterpret_cast<const __m128i*
>(&EK[0]);
// One 16-byte block per iteration; unaligned load/store, so in/out need no
// particular alignment.
347 for(
size_t i = 0; i != blocks; ++i)
349 __m128i B = _mm_loadu_si128(in_mm + i);
350 _mm_storeu_si128(out_mm + i, aes_ssse3_encrypt(B, keys, 10));
// Block-decryption loop with a fixed round count of 10.
// NOTE(review): the enclosing function header is not visible at this position;
// presumably this is the body of AES_128_SSSE3::decrypt_n(in, out, blocks) — confirm.
// View the byte buffers and the DK key storage as arrays of 128-bit vectors.
359 const __m128i* in_mm =
reinterpret_cast<const __m128i*
>(in);
360 __m128i* out_mm =
reinterpret_cast<__m128i*
>(out);
362 const __m128i* keys =
reinterpret_cast<const __m128i*
>(&DK[0]);
// One 16-byte block per iteration; unaligned load/store, so in/out need no
// particular alignment.
364 for(
size_t i = 0; i != blocks; ++i)
366 __m128i B = _mm_loadu_si128(in_mm + i);
367 _mm_storeu_si128(out_mm + i, aes_ssse3_decrypt(B, keys, 10));
// Expands a 16-byte key at keyb into 11 encryption round keys (EK[0..10]) and
// 11 decryption round keys (DK[0..10]). The length argument is unnamed and unused.
// Decryption keys are written in reverse order: the round that fills EK[i]
// fills DK[10-i].
374 void AES_128_SSSE3::key_schedule(
const byte keyb[],
size_t)
// Round-constant vector, consumed and rotated by aes_schedule_round.
376 __m128i rcon = _mm_set_epi32(0x702A9808, 0x4D7C7D81,
377 0x1F8391B9, 0xAF9DEEB6);
// Unaligned load of the 16 key bytes.
379 __m128i key = _mm_loadu_si128(reinterpret_cast<const __m128i*>(keyb));
381 __m128i* EK_mm =
reinterpret_cast<__m128i*
>(&EK[0]);
382 __m128i* DK_mm =
reinterpret_cast<__m128i*
>(&DK[0]);
// DK[10] is the raw key with only the sr[2] byte permutation applied.
384 _mm_storeu_si128(DK_mm + 10, _mm_shuffle_epi8(key, sr[2]));
// Apply the k_ipt table pair to the key; the result is stored as EK[0].
386 key = aes_schedule_transform(key, k_ipt1, k_ipt2);
388 _mm_storeu_si128(EK_mm, key);
// Rounds 1..9: advance the schedule, then store the mangled encryption and
// decryption forms.
390 for(
size_t i = 1; i != 10; ++i)
392 key = aes_schedule_round(&rcon, key, key);
394 _mm_storeu_si128(EK_mm + i,
395 aes_schedule_mangle(key, (12-i) % 4));
397 _mm_storeu_si128(DK_mm + (10-i),
398 aes_schedule_mangle_dec(key, (10-i) % 4));
// Final round uses the dedicated "last" mangle helpers.
401 key = aes_schedule_round(&rcon, key, key);
402 _mm_storeu_si128(EK_mm + 10, aes_schedule_mangle_last(key, 2));
403 _mm_storeu_si128(DK_mm, aes_schedule_mangle_last_dec(key));
// Block-encryption loop with a fixed round count of 12.
// NOTE(review): the enclosing function header is not visible at this position;
// presumably this is the body of AES_192_SSSE3::encrypt_n(in, out, blocks) — confirm.
// View the byte buffers and the EK key storage as arrays of 128-bit vectors.
411 const __m128i* in_mm =
reinterpret_cast<const __m128i*
>(in);
412 __m128i* out_mm =
reinterpret_cast<__m128i*
>(out);
414 const __m128i* keys =
reinterpret_cast<const __m128i*
>(&EK[0]);
// One 16-byte block per iteration; unaligned load/store, so in/out need no
// particular alignment.
416 for(
size_t i = 0; i != blocks; ++i)
418 __m128i B = _mm_loadu_si128(in_mm + i);
419 _mm_storeu_si128(out_mm + i, aes_ssse3_encrypt(B, keys, 12));
// Block-decryption loop with a fixed round count of 12.
// NOTE(review): the enclosing function header is not visible at this position;
// presumably this is the body of AES_192_SSSE3::decrypt_n(in, out, blocks) — confirm.
// View the byte buffers and the DK key storage as arrays of 128-bit vectors.
428 const __m128i* in_mm =
reinterpret_cast<const __m128i*
>(in);
429 __m128i* out_mm =
reinterpret_cast<__m128i*
>(out);
431 const __m128i* keys =
reinterpret_cast<const __m128i*
>(&DK[0]);
// One 16-byte block per iteration; unaligned load/store, so in/out need no
// particular alignment.
433 for(
size_t i = 0; i != blocks; ++i)
435 __m128i B = _mm_loadu_si128(in_mm + i);
436 _mm_storeu_si128(out_mm + i, aes_ssse3_decrypt(B, keys, 12));
// Expands a 24-byte key at keyb into 13 encryption round keys (EK[0..12]) and
// 13 decryption round keys (DK[0..12]). The length argument is unnamed and unused.
// Each of the four loop iterations emits three round keys; decryption keys are
// written in reverse order (EK[j] pairs with DK[12-j]).
// NOTE(review): braces and the if/else that chooses between the two
// EK[3*i+3] / DK[9-3*i] store pairs are not visible in this listing.
443 void AES_192_SSSE3::key_schedule(
const byte keyb[],
size_t)
// Round-constant vector, consumed and rotated by aes_schedule_round.
445 __m128i rcon = _mm_set_epi32(0x702A9808, 0x4D7C7D81,
446 0x1F8391B9, 0xAF9DEEB6);
448 __m128i* EK_mm =
reinterpret_cast<__m128i*
>(&EK[0]);
449 __m128i* DK_mm =
reinterpret_cast<__m128i*
>(&DK[0]);
// key1 = key bytes 0..15, key2 = key bytes 8..23 (the two loads overlap by 8 bytes).
451 __m128i key1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(keyb));
452 __m128i key2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>((keyb + 8)));
// DK[12] is the raw first 16 key bytes shuffled by sr[0] (the identity order).
454 _mm_storeu_si128(DK_mm + 12, _mm_shuffle_epi8(key1, sr[0]));
// Apply the k_ipt table pair to both halves; EK[0] is the transformed key1.
456 key1 = aes_schedule_transform(key1, k_ipt1, k_ipt2);
457 key2 = aes_schedule_transform(key2, k_ipt1, k_ipt2);
459 _mm_storeu_si128(EK_mm + 0, key1);
// t = key2 with its low 8 bytes cleared (shift right, then left, by 8 bytes).
462 __m128i t = _mm_slli_si128(_mm_srli_si128(key2, 8), 8);
464 for(
size_t i = 0; i != 4; ++i)
466 key2 = aes_schedule_round(&rcon, key2, key1);
// First key of the triple: high 8 bytes of t followed by low 8 bytes of key2.
468 _mm_storeu_si128(EK_mm + 3*i+1,
469 aes_schedule_mangle(_mm_alignr_epi8(key2, t, 8), (i+3)%4));
470 _mm_storeu_si128(DK_mm + 11-3*i,
471 aes_schedule_mangle_dec(_mm_alignr_epi8(key2, t, 8), (i+3)%4));
// Second key of the triple comes from the smeared combination of key2 and t.
473 t = aes_schedule_192_smear(key2, t);
475 _mm_storeu_si128(EK_mm + 3*i+2,
476 aes_schedule_mangle(t, (i+2)%4));
477 _mm_storeu_si128(DK_mm + 10-3*i,
478 aes_schedule_mangle_dec(t, (i+2)%4));
480 key2 = aes_schedule_round(&rcon, t, key2);
// Third key of the triple, variant using the "last" mangle helpers
// (presumably only on the final iteration — condition not visible).
484 _mm_storeu_si128(EK_mm + 3*i+3,
485 aes_schedule_mangle_last(key2, (i+1)%4));
486 _mm_storeu_si128(DK_mm + 9-3*i,
487 aes_schedule_mangle_last_dec(key2));
// Third key of the triple, variant using the regular mangle helpers
// (presumably on all other iterations — condition not visible).
491 _mm_storeu_si128(EK_mm + 3*i+3,
492 aes_schedule_mangle(key2, (i+1)%4));
493 _mm_storeu_si128(DK_mm + 9-3*i,
494 aes_schedule_mangle_dec(key2, (i+1)%4));
// Prepare key2 and t for the next iteration: smear key2 with the high half of
// t, then take the high half of the new key2 as t.
498 key2 = aes_schedule_192_smear(key2,
499 _mm_slli_si128(_mm_srli_si128(t, 8), 8));
500 t = _mm_slli_si128(_mm_srli_si128(key2, 8), 8);
// Block-encryption loop with a fixed round count of 14.
// NOTE(review): the enclosing function header is not visible at this position;
// presumably this is the body of AES_256_SSSE3::encrypt_n(in, out, blocks) — confirm.
// View the byte buffers and the EK key storage as arrays of 128-bit vectors.
510 const __m128i* in_mm =
reinterpret_cast<const __m128i*
>(in);
511 __m128i* out_mm =
reinterpret_cast<__m128i*
>(out);
513 const __m128i* keys =
reinterpret_cast<const __m128i*
>(&EK[0]);
// One 16-byte block per iteration; unaligned load/store, so in/out need no
// particular alignment.
515 for(
size_t i = 0; i != blocks; ++i)
517 __m128i B = _mm_loadu_si128(in_mm + i);
518 _mm_storeu_si128(out_mm + i, aes_ssse3_encrypt(B, keys, 14));
// Block-decryption loop with a fixed round count of 14.
// NOTE(review): the enclosing function header is not visible at this position;
// presumably this is the body of AES_256_SSSE3::decrypt_n(in, out, blocks) — confirm.
// View the byte buffers and the DK key storage as arrays of 128-bit vectors.
527 const __m128i* in_mm =
reinterpret_cast<const __m128i*
>(in);
528 __m128i* out_mm =
reinterpret_cast<__m128i*
>(out);
530 const __m128i* keys =
reinterpret_cast<const __m128i*
>(&DK[0]);
// One 16-byte block per iteration; unaligned load/store, so in/out need no
// particular alignment.
532 for(
size_t i = 0; i != blocks; ++i)
534 __m128i B = _mm_loadu_si128(in_mm + i);
535 _mm_storeu_si128(out_mm + i, aes_ssse3_decrypt(B, keys, 14));
// Expands a 32-byte key at keyb into 15 encryption round keys (EK[0..14]) and
// 15 decryption round keys (DK[0..14]). The length argument is unnamed and unused.
// Decryption keys are written in reverse order (EK[j] pairs with DK[14-j]).
// NOTE(review): braces and the declaration of k_t are not visible in this listing.
542 void AES_256_SSSE3::key_schedule(
const byte keyb[],
size_t)
// Round-constant vector, consumed and rotated by aes_schedule_round.
544 __m128i rcon = _mm_set_epi32(0x702A9808, 0x4D7C7D81,
545 0x1F8391B9, 0xAF9DEEB6);
547 __m128i* EK_mm =
reinterpret_cast<__m128i*
>(&EK[0]);
548 __m128i* DK_mm =
reinterpret_cast<__m128i*
>(&DK[0]);
// key1 = key bytes 0..15, key2 = key bytes 16..31.
550 __m128i key1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(keyb));
551 __m128i key2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>((keyb + 16)));
// DK[14] is the raw first key half with only the sr[2] byte permutation applied.
553 _mm_storeu_si128(DK_mm + 14, _mm_shuffle_epi8(key1, sr[2]));
// Apply the k_ipt table pair to both halves.
555 key1 = aes_schedule_transform(key1, k_ipt1, k_ipt2);
556 key2 = aes_schedule_transform(key2, k_ipt1, k_ipt2);
// The first two round keys come straight from the two transformed halves.
558 _mm_storeu_si128(EK_mm + 0, key1);
559 _mm_storeu_si128(EK_mm + 1, aes_schedule_mangle(key2, 3));
561 _mm_storeu_si128(DK_mm + 13, aes_schedule_mangle_dec(key2, 1));
// Each iteration produces two round keys: index i (even) and i + 1 (odd).
563 for(
size_t i = 2; i != 14; i += 2)
// Even key: a regular schedule round that consumes a round constant.
566 key1 = key2 = aes_schedule_round(&rcon, key2, key1);
568 _mm_storeu_si128(EK_mm + i, aes_schedule_mangle(key2, i % 4));
569 _mm_storeu_si128(DK_mm + (14-i), aes_schedule_mangle_dec(key2, (i+2) % 4));
// Odd key: round without a round constant (rcon is NULL), fed with the
// broadcast top dword of key2 and k_t (presumably a saved earlier key value).
571 key2 = aes_schedule_round(NULL, _mm_shuffle_epi32(key2, 0xFF), k_t);
572 _mm_storeu_si128(EK_mm + i + 1, aes_schedule_mangle(key2, (i - 1) % 4));
573 _mm_storeu_si128(DK_mm + (13-i), aes_schedule_mangle_dec(key2, (i+1) % 4));
// Final round key uses the dedicated "last" mangle helpers.
576 key2 = aes_schedule_round(&rcon, key2, key1);
578 _mm_storeu_si128(EK_mm + 14, aes_schedule_mangle_last(key2, 2));
579 _mm_storeu_si128(DK_mm + 0, aes_schedule_mangle_last_dec(key2));
// NOTE(review): these six const member-function signatures appear here without
// their bodies or class qualifiers. Presumably they are the headers of the six
// block loops earlier in this listing that call aes_ssse3_encrypt /
// aes_ssse3_decrypt with 10, 12 and 14 rounds — confirm against the full file.
// Parameters as used by those loops: `in` and `out` are byte buffers processed
// 16 bytes at a time, and `blocks` is the number of 16-byte blocks.
void decrypt_n(const byte in[], byte out[], size_t blocks) const
void encrypt_n(const byte in[], byte out[], size_t blocks) const
void decrypt_n(const byte in[], byte out[], size_t blocks) const
void decrypt_n(const byte in[], byte out[], size_t blocks) const
void encrypt_n(const byte in[], byte out[], size_t blocks) const
void encrypt_n(const byte in[], byte out[], size_t blocks) const