Botan  1.10.9
aes_ni.cpp
Go to the documentation of this file.
1 /*
2 * AES using AES-NI instructions
3 * (C) 2009 Jack Lloyd
4 *
5 * Distributed under the terms of the Botan license
6 */
7 
8 #include <botan/aes_ni.h>
9 #include <botan/loadstor.h>
10 #include <wmmintrin.h>
11 
12 namespace Botan {
13 
14 namespace {
15 
/*
* One step of the AES-128 style key expansion: combines the running XOR
* of the words of `key` with the top word of `key_with_rcon` (callers in
* this file pass the result of _mm_aeskeygenassist_si128).
*/
__m128i aes_128_key_expansion(__m128i key, __m128i key_with_rcon)
   {
   // Replicate the highest 32-bit word into all four lanes
   const __m128i assist =
      _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(3,3,3,3));

   // After three shift-and-xor passes word i holds w0 ^ w1 ^ ... ^ wi
   __m128i folded = key;
   for(int pass = 0; pass != 3; ++pass)
      folded = _mm_xor_si128(folded, _mm_slli_si128(folded, 4));

   return _mm_xor_si128(folded, assist);
   }
24 
25 void aes_192_key_expansion(__m128i* K1, __m128i* K2, __m128i key2_with_rcon,
26  u32bit out[], bool last)
27  {
28  __m128i key1 = *K1;
29  __m128i key2 = *K2;
30 
31  key2_with_rcon = _mm_shuffle_epi32(key2_with_rcon, _MM_SHUFFLE(1,1,1,1));
32  key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
33  key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
34  key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
35  key1 = _mm_xor_si128(key1, key2_with_rcon);
36 
37  *K1 = key1;
38  _mm_storeu_si128((__m128i*)out, key1);
39 
40  if(last)
41  return;
42 
43  key2 = _mm_xor_si128(key2, _mm_slli_si128(key2, 4));
44  key2 = _mm_xor_si128(key2, _mm_shuffle_epi32(key1, _MM_SHUFFLE(3,3,3,3)));
45 
46  *K2 = key2;
47  out[4] = _mm_cvtsi128_si32(key2);
48  out[5] = _mm_cvtsi128_si32(_mm_srli_si128(key2, 4));
49  }
50 
/*
* The second half of the AES-256 key expansion (other half same as AES-128)
*
* `key` supplies the words that are folded together; `key2` is fed to
* the key-generation assist instruction with a zero round constant.
*/
__m128i aes_256_key_expansion(__m128i key, __m128i key2)
   {
   // Word 2 of the assist output is replicated across all lanes
   const __m128i assist =
      _mm_shuffle_epi32(_mm_aeskeygenassist_si128(key2, 0x00),
                        _MM_SHUFFLE(2,2,2,2));

   // Running XOR across the four words of key
   __m128i folded = key;
   for(int pass = 0; pass != 3; ++pass)
      folded = _mm_xor_si128(folded, _mm_slli_si128(folded, 4));

   return _mm_xor_si128(folded, assist);
   }
64 
65 }
66 
/*
* Apply one AES encryption round with round key K to the four blocks
* B0..B3, which must be in scope where the macro is expanded.
*/
#define AES_ENC_4_ROUNDS(K)                  \
   do                                        \
      {                                      \
      B0 = _mm_aesenc_si128(B0, (K));        \
      B1 = _mm_aesenc_si128(B1, (K));        \
      B2 = _mm_aesenc_si128(B2, (K));        \
      B3 = _mm_aesenc_si128(B3, (K));        \
      } while(0)
75 
/*
* Apply the final AES encryption round with round key K to the four
* blocks B0..B3, which must be in scope where the macro is expanded.
*/
#define AES_ENC_4_LAST_ROUNDS(K)             \
   do                                        \
      {                                      \
      B0 = _mm_aesenclast_si128(B0, (K));    \
      B1 = _mm_aesenclast_si128(B1, (K));    \
      B2 = _mm_aesenclast_si128(B2, (K));    \
      B3 = _mm_aesenclast_si128(B3, (K));    \
      } while(0)
84 
/*
* Apply one AES decryption round with round key K to the four blocks
* B0..B3, which must be in scope where the macro is expanded.
*/
#define AES_DEC_4_ROUNDS(K)                  \
   do                                        \
      {                                      \
      B0 = _mm_aesdec_si128(B0, (K));        \
      B1 = _mm_aesdec_si128(B1, (K));        \
      B2 = _mm_aesdec_si128(B2, (K));        \
      B3 = _mm_aesdec_si128(B3, (K));        \
      } while(0)
93 
/*
* Apply the final AES decryption round with round key K to the four
* blocks B0..B3, which must be in scope where the macro is expanded.
*/
#define AES_DEC_4_LAST_ROUNDS(K)             \
   do                                        \
      {                                      \
      B0 = _mm_aesdeclast_si128(B0, (K));    \
      B1 = _mm_aesdeclast_si128(B1, (K));    \
      B2 = _mm_aesdeclast_si128(B2, (K));    \
      B3 = _mm_aesdeclast_si128(B3, (K));    \
      } while(0)
102 
103 /*
104 * AES-128 Encryption
105 */
106 void AES_128_NI::encrypt_n(const byte in[], byte out[], size_t blocks) const
107  {
108  const __m128i* in_mm = (const __m128i*)in;
109  __m128i* out_mm = (__m128i*)out;
110 
111  const __m128i* key_mm = (const __m128i*)&EK[0];
112 
113  __m128i K0 = _mm_loadu_si128(key_mm);
114  __m128i K1 = _mm_loadu_si128(key_mm + 1);
115  __m128i K2 = _mm_loadu_si128(key_mm + 2);
116  __m128i K3 = _mm_loadu_si128(key_mm + 3);
117  __m128i K4 = _mm_loadu_si128(key_mm + 4);
118  __m128i K5 = _mm_loadu_si128(key_mm + 5);
119  __m128i K6 = _mm_loadu_si128(key_mm + 6);
120  __m128i K7 = _mm_loadu_si128(key_mm + 7);
121  __m128i K8 = _mm_loadu_si128(key_mm + 8);
122  __m128i K9 = _mm_loadu_si128(key_mm + 9);
123  __m128i K10 = _mm_loadu_si128(key_mm + 10);
124 
125  while(blocks >= 4)
126  {
127  __m128i B0 = _mm_loadu_si128(in_mm + 0);
128  __m128i B1 = _mm_loadu_si128(in_mm + 1);
129  __m128i B2 = _mm_loadu_si128(in_mm + 2);
130  __m128i B3 = _mm_loadu_si128(in_mm + 3);
131 
132  B0 = _mm_xor_si128(B0, K0);
133  B1 = _mm_xor_si128(B1, K0);
134  B2 = _mm_xor_si128(B2, K0);
135  B3 = _mm_xor_si128(B3, K0);
136 
137  AES_ENC_4_ROUNDS(K1);
138  AES_ENC_4_ROUNDS(K2);
139  AES_ENC_4_ROUNDS(K3);
140  AES_ENC_4_ROUNDS(K4);
141  AES_ENC_4_ROUNDS(K5);
142  AES_ENC_4_ROUNDS(K6);
143  AES_ENC_4_ROUNDS(K7);
144  AES_ENC_4_ROUNDS(K8);
145  AES_ENC_4_ROUNDS(K9);
147 
148  _mm_storeu_si128(out_mm + 0, B0);
149  _mm_storeu_si128(out_mm + 1, B1);
150  _mm_storeu_si128(out_mm + 2, B2);
151  _mm_storeu_si128(out_mm + 3, B3);
152 
153  blocks -= 4;
154  in_mm += 4;
155  out_mm += 4;
156  }
157 
158  for(size_t i = 0; i != blocks; ++i)
159  {
160  __m128i B = _mm_loadu_si128(in_mm + i);
161 
162  B = _mm_xor_si128(B, K0);
163 
164  B = _mm_aesenc_si128(B, K1);
165  B = _mm_aesenc_si128(B, K2);
166  B = _mm_aesenc_si128(B, K3);
167  B = _mm_aesenc_si128(B, K4);
168  B = _mm_aesenc_si128(B, K5);
169  B = _mm_aesenc_si128(B, K6);
170  B = _mm_aesenc_si128(B, K7);
171  B = _mm_aesenc_si128(B, K8);
172  B = _mm_aesenc_si128(B, K9);
173  B = _mm_aesenclast_si128(B, K10);
174 
175  _mm_storeu_si128(out_mm + i, B);
176  }
177  }
178 
179 /*
180 * AES-128 Decryption
181 */
182 void AES_128_NI::decrypt_n(const byte in[], byte out[], size_t blocks) const
183  {
184  const __m128i* in_mm = (const __m128i*)in;
185  __m128i* out_mm = (__m128i*)out;
186 
187  const __m128i* key_mm = (const __m128i*)&DK[0];
188 
189  __m128i K0 = _mm_loadu_si128(key_mm);
190  __m128i K1 = _mm_loadu_si128(key_mm + 1);
191  __m128i K2 = _mm_loadu_si128(key_mm + 2);
192  __m128i K3 = _mm_loadu_si128(key_mm + 3);
193  __m128i K4 = _mm_loadu_si128(key_mm + 4);
194  __m128i K5 = _mm_loadu_si128(key_mm + 5);
195  __m128i K6 = _mm_loadu_si128(key_mm + 6);
196  __m128i K7 = _mm_loadu_si128(key_mm + 7);
197  __m128i K8 = _mm_loadu_si128(key_mm + 8);
198  __m128i K9 = _mm_loadu_si128(key_mm + 9);
199  __m128i K10 = _mm_loadu_si128(key_mm + 10);
200 
201  while(blocks >= 4)
202  {
203  __m128i B0 = _mm_loadu_si128(in_mm + 0);
204  __m128i B1 = _mm_loadu_si128(in_mm + 1);
205  __m128i B2 = _mm_loadu_si128(in_mm + 2);
206  __m128i B3 = _mm_loadu_si128(in_mm + 3);
207 
208  B0 = _mm_xor_si128(B0, K0);
209  B1 = _mm_xor_si128(B1, K0);
210  B2 = _mm_xor_si128(B2, K0);
211  B3 = _mm_xor_si128(B3, K0);
212 
213  AES_DEC_4_ROUNDS(K1);
214  AES_DEC_4_ROUNDS(K2);
215  AES_DEC_4_ROUNDS(K3);
216  AES_DEC_4_ROUNDS(K4);
217  AES_DEC_4_ROUNDS(K5);
218  AES_DEC_4_ROUNDS(K6);
219  AES_DEC_4_ROUNDS(K7);
220  AES_DEC_4_ROUNDS(K8);
221  AES_DEC_4_ROUNDS(K9);
223 
224  _mm_storeu_si128(out_mm + 0, B0);
225  _mm_storeu_si128(out_mm + 1, B1);
226  _mm_storeu_si128(out_mm + 2, B2);
227  _mm_storeu_si128(out_mm + 3, B3);
228 
229  blocks -= 4;
230  in_mm += 4;
231  out_mm += 4;
232  }
233 
234  for(size_t i = 0; i != blocks; ++i)
235  {
236  __m128i B = _mm_loadu_si128(in_mm + i);
237 
238  B = _mm_xor_si128(B, K0);
239 
240  B = _mm_aesdec_si128(B, K1);
241  B = _mm_aesdec_si128(B, K2);
242  B = _mm_aesdec_si128(B, K3);
243  B = _mm_aesdec_si128(B, K4);
244  B = _mm_aesdec_si128(B, K5);
245  B = _mm_aesdec_si128(B, K6);
246  B = _mm_aesdec_si128(B, K7);
247  B = _mm_aesdec_si128(B, K8);
248  B = _mm_aesdec_si128(B, K9);
249  B = _mm_aesdeclast_si128(B, K10);
250 
251  _mm_storeu_si128(out_mm + i, B);
252  }
253  }
254 
255 /*
256 * AES-128 Key Schedule
257 */
258 void AES_128_NI::key_schedule(const byte key[], size_t)
259  {
260  #define AES_128_key_exp(K, RCON) \
261  aes_128_key_expansion(K, _mm_aeskeygenassist_si128(K, RCON))
262 
263  __m128i K0 = _mm_loadu_si128((const __m128i*)(key));
264  __m128i K1 = AES_128_key_exp(K0, 0x01);
265  __m128i K2 = AES_128_key_exp(K1, 0x02);
266  __m128i K3 = AES_128_key_exp(K2, 0x04);
267  __m128i K4 = AES_128_key_exp(K3, 0x08);
268  __m128i K5 = AES_128_key_exp(K4, 0x10);
269  __m128i K6 = AES_128_key_exp(K5, 0x20);
270  __m128i K7 = AES_128_key_exp(K6, 0x40);
271  __m128i K8 = AES_128_key_exp(K7, 0x80);
272  __m128i K9 = AES_128_key_exp(K8, 0x1B);
273  __m128i K10 = AES_128_key_exp(K9, 0x36);
274 
275  __m128i* EK_mm = (__m128i*)&EK[0];
276  _mm_storeu_si128(EK_mm , K0);
277  _mm_storeu_si128(EK_mm + 1, K1);
278  _mm_storeu_si128(EK_mm + 2, K2);
279  _mm_storeu_si128(EK_mm + 3, K3);
280  _mm_storeu_si128(EK_mm + 4, K4);
281  _mm_storeu_si128(EK_mm + 5, K5);
282  _mm_storeu_si128(EK_mm + 6, K6);
283  _mm_storeu_si128(EK_mm + 7, K7);
284  _mm_storeu_si128(EK_mm + 8, K8);
285  _mm_storeu_si128(EK_mm + 9, K9);
286  _mm_storeu_si128(EK_mm + 10, K10);
287 
288  // Now generate decryption keys
289 
290  __m128i* DK_mm = (__m128i*)&DK[0];
291  _mm_storeu_si128(DK_mm , K10);
292  _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(K9));
293  _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(K8));
294  _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(K7));
295  _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(K6));
296  _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(K5));
297  _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(K4));
298  _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(K3));
299  _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(K2));
300  _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(K1));
301  _mm_storeu_si128(DK_mm + 10, K0);
302  }
303 
304 /*
305 * Clear memory of sensitive data
306 */
308  {
309  zeroise(EK);
310  zeroise(DK);
311  }
312 
313 /*
314 * AES-192 Encryption
315 */
316 void AES_192_NI::encrypt_n(const byte in[], byte out[], size_t blocks) const
317  {
318  const __m128i* in_mm = (const __m128i*)in;
319  __m128i* out_mm = (__m128i*)out;
320 
321  const __m128i* key_mm = (const __m128i*)&EK[0];
322 
323  __m128i K0 = _mm_loadu_si128(key_mm);
324  __m128i K1 = _mm_loadu_si128(key_mm + 1);
325  __m128i K2 = _mm_loadu_si128(key_mm + 2);
326  __m128i K3 = _mm_loadu_si128(key_mm + 3);
327  __m128i K4 = _mm_loadu_si128(key_mm + 4);
328  __m128i K5 = _mm_loadu_si128(key_mm + 5);
329  __m128i K6 = _mm_loadu_si128(key_mm + 6);
330  __m128i K7 = _mm_loadu_si128(key_mm + 7);
331  __m128i K8 = _mm_loadu_si128(key_mm + 8);
332  __m128i K9 = _mm_loadu_si128(key_mm + 9);
333  __m128i K10 = _mm_loadu_si128(key_mm + 10);
334  __m128i K11 = _mm_loadu_si128(key_mm + 11);
335  __m128i K12 = _mm_loadu_si128(key_mm + 12);
336 
337  while(blocks >= 4)
338  {
339  __m128i B0 = _mm_loadu_si128(in_mm + 0);
340  __m128i B1 = _mm_loadu_si128(in_mm + 1);
341  __m128i B2 = _mm_loadu_si128(in_mm + 2);
342  __m128i B3 = _mm_loadu_si128(in_mm + 3);
343 
344  B0 = _mm_xor_si128(B0, K0);
345  B1 = _mm_xor_si128(B1, K0);
346  B2 = _mm_xor_si128(B2, K0);
347  B3 = _mm_xor_si128(B3, K0);
348 
349  AES_ENC_4_ROUNDS(K1);
350  AES_ENC_4_ROUNDS(K2);
351  AES_ENC_4_ROUNDS(K3);
352  AES_ENC_4_ROUNDS(K4);
353  AES_ENC_4_ROUNDS(K5);
354  AES_ENC_4_ROUNDS(K6);
355  AES_ENC_4_ROUNDS(K7);
356  AES_ENC_4_ROUNDS(K8);
357  AES_ENC_4_ROUNDS(K9);
358  AES_ENC_4_ROUNDS(K10);
359  AES_ENC_4_ROUNDS(K11);
361 
362  _mm_storeu_si128(out_mm + 0, B0);
363  _mm_storeu_si128(out_mm + 1, B1);
364  _mm_storeu_si128(out_mm + 2, B2);
365  _mm_storeu_si128(out_mm + 3, B3);
366 
367  blocks -= 4;
368  in_mm += 4;
369  out_mm += 4;
370  }
371 
372  for(size_t i = 0; i != blocks; ++i)
373  {
374  __m128i B = _mm_loadu_si128(in_mm + i);
375 
376  B = _mm_xor_si128(B, K0);
377 
378  B = _mm_aesenc_si128(B, K1);
379  B = _mm_aesenc_si128(B, K2);
380  B = _mm_aesenc_si128(B, K3);
381  B = _mm_aesenc_si128(B, K4);
382  B = _mm_aesenc_si128(B, K5);
383  B = _mm_aesenc_si128(B, K6);
384  B = _mm_aesenc_si128(B, K7);
385  B = _mm_aesenc_si128(B, K8);
386  B = _mm_aesenc_si128(B, K9);
387  B = _mm_aesenc_si128(B, K10);
388  B = _mm_aesenc_si128(B, K11);
389  B = _mm_aesenclast_si128(B, K12);
390 
391  _mm_storeu_si128(out_mm + i, B);
392  }
393  }
394 
395 /*
396 * AES-192 Decryption
397 */
398 void AES_192_NI::decrypt_n(const byte in[], byte out[], size_t blocks) const
399  {
400  const __m128i* in_mm = (const __m128i*)in;
401  __m128i* out_mm = (__m128i*)out;
402 
403  const __m128i* key_mm = (const __m128i*)&DK[0];
404 
405  __m128i K0 = _mm_loadu_si128(key_mm);
406  __m128i K1 = _mm_loadu_si128(key_mm + 1);
407  __m128i K2 = _mm_loadu_si128(key_mm + 2);
408  __m128i K3 = _mm_loadu_si128(key_mm + 3);
409  __m128i K4 = _mm_loadu_si128(key_mm + 4);
410  __m128i K5 = _mm_loadu_si128(key_mm + 5);
411  __m128i K6 = _mm_loadu_si128(key_mm + 6);
412  __m128i K7 = _mm_loadu_si128(key_mm + 7);
413  __m128i K8 = _mm_loadu_si128(key_mm + 8);
414  __m128i K9 = _mm_loadu_si128(key_mm + 9);
415  __m128i K10 = _mm_loadu_si128(key_mm + 10);
416  __m128i K11 = _mm_loadu_si128(key_mm + 11);
417  __m128i K12 = _mm_loadu_si128(key_mm + 12);
418 
419  while(blocks >= 4)
420  {
421  __m128i B0 = _mm_loadu_si128(in_mm + 0);
422  __m128i B1 = _mm_loadu_si128(in_mm + 1);
423  __m128i B2 = _mm_loadu_si128(in_mm + 2);
424  __m128i B3 = _mm_loadu_si128(in_mm + 3);
425 
426  B0 = _mm_xor_si128(B0, K0);
427  B1 = _mm_xor_si128(B1, K0);
428  B2 = _mm_xor_si128(B2, K0);
429  B3 = _mm_xor_si128(B3, K0);
430 
431  AES_DEC_4_ROUNDS(K1);
432  AES_DEC_4_ROUNDS(K2);
433  AES_DEC_4_ROUNDS(K3);
434  AES_DEC_4_ROUNDS(K4);
435  AES_DEC_4_ROUNDS(K5);
436  AES_DEC_4_ROUNDS(K6);
437  AES_DEC_4_ROUNDS(K7);
438  AES_DEC_4_ROUNDS(K8);
439  AES_DEC_4_ROUNDS(K9);
440  AES_DEC_4_ROUNDS(K10);
441  AES_DEC_4_ROUNDS(K11);
443 
444  _mm_storeu_si128(out_mm + 0, B0);
445  _mm_storeu_si128(out_mm + 1, B1);
446  _mm_storeu_si128(out_mm + 2, B2);
447  _mm_storeu_si128(out_mm + 3, B3);
448 
449  blocks -= 4;
450  in_mm += 4;
451  out_mm += 4;
452  }
453 
454  for(size_t i = 0; i != blocks; ++i)
455  {
456  __m128i B = _mm_loadu_si128(in_mm + i);
457 
458  B = _mm_xor_si128(B, K0);
459 
460  B = _mm_aesdec_si128(B, K1);
461  B = _mm_aesdec_si128(B, K2);
462  B = _mm_aesdec_si128(B, K3);
463  B = _mm_aesdec_si128(B, K4);
464  B = _mm_aesdec_si128(B, K5);
465  B = _mm_aesdec_si128(B, K6);
466  B = _mm_aesdec_si128(B, K7);
467  B = _mm_aesdec_si128(B, K8);
468  B = _mm_aesdec_si128(B, K9);
469  B = _mm_aesdec_si128(B, K10);
470  B = _mm_aesdec_si128(B, K11);
471  B = _mm_aesdeclast_si128(B, K12);
472 
473  _mm_storeu_si128(out_mm + i, B);
474  }
475  }
476 
477 /*
478 * AES-192 Key Schedule
479 */
480 void AES_192_NI::key_schedule(const byte key[], size_t)
481  {
482  __m128i K0 = _mm_loadu_si128((const __m128i*)(key));
483  __m128i K1 = _mm_loadu_si128((const __m128i*)(key + 8));
484  K1 = _mm_srli_si128(K1, 8);
485 
486  load_le(&EK[0], key, 6);
487 
488 #define AES_192_key_exp(RCON, EK_OFF) \
489  aes_192_key_expansion(&K0, &K1, \
490  _mm_aeskeygenassist_si128(K1, RCON), \
491  EK + EK_OFF, EK_OFF == 48)
492 
493  AES_192_key_exp(0x01, 6);
494  AES_192_key_exp(0x02, 12);
495  AES_192_key_exp(0x04, 18);
496  AES_192_key_exp(0x08, 24);
497  AES_192_key_exp(0x10, 30);
498  AES_192_key_exp(0x20, 36);
499  AES_192_key_exp(0x40, 42);
500  AES_192_key_exp(0x80, 48);
501 
502  // Now generate decryption keys
503  const __m128i* EK_mm = (const __m128i*)&EK[0];
504  __m128i* DK_mm = (__m128i*)&DK[0];
505  _mm_storeu_si128(DK_mm , _mm_loadu_si128(EK_mm + 12));
506  _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 11)));
507  _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 10)));
508  _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 9)));
509  _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 8)));
510  _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 7)));
511  _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 6)));
512  _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 5)));
513  _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 4)));
514  _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 3)));
515  _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 2)));
516  _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 1)));
517  _mm_storeu_si128(DK_mm + 12, _mm_loadu_si128(EK_mm + 0));
518  }
519 
520 /*
521 * Clear memory of sensitive data
522 */
524  {
525  zeroise(EK);
526  zeroise(DK);
527  }
528 
529 /*
530 * AES-256 Encryption
531 */
532 void AES_256_NI::encrypt_n(const byte in[], byte out[], size_t blocks) const
533  {
534  const __m128i* in_mm = (const __m128i*)in;
535  __m128i* out_mm = (__m128i*)out;
536 
537  const __m128i* key_mm = (const __m128i*)&EK[0];
538 
539  __m128i K0 = _mm_loadu_si128(key_mm);
540  __m128i K1 = _mm_loadu_si128(key_mm + 1);
541  __m128i K2 = _mm_loadu_si128(key_mm + 2);
542  __m128i K3 = _mm_loadu_si128(key_mm + 3);
543  __m128i K4 = _mm_loadu_si128(key_mm + 4);
544  __m128i K5 = _mm_loadu_si128(key_mm + 5);
545  __m128i K6 = _mm_loadu_si128(key_mm + 6);
546  __m128i K7 = _mm_loadu_si128(key_mm + 7);
547  __m128i K8 = _mm_loadu_si128(key_mm + 8);
548  __m128i K9 = _mm_loadu_si128(key_mm + 9);
549  __m128i K10 = _mm_loadu_si128(key_mm + 10);
550  __m128i K11 = _mm_loadu_si128(key_mm + 11);
551  __m128i K12 = _mm_loadu_si128(key_mm + 12);
552  __m128i K13 = _mm_loadu_si128(key_mm + 13);
553  __m128i K14 = _mm_loadu_si128(key_mm + 14);
554 
555  while(blocks >= 4)
556  {
557  __m128i B0 = _mm_loadu_si128(in_mm + 0);
558  __m128i B1 = _mm_loadu_si128(in_mm + 1);
559  __m128i B2 = _mm_loadu_si128(in_mm + 2);
560  __m128i B3 = _mm_loadu_si128(in_mm + 3);
561 
562  B0 = _mm_xor_si128(B0, K0);
563  B1 = _mm_xor_si128(B1, K0);
564  B2 = _mm_xor_si128(B2, K0);
565  B3 = _mm_xor_si128(B3, K0);
566 
567  AES_ENC_4_ROUNDS(K1);
568  AES_ENC_4_ROUNDS(K2);
569  AES_ENC_4_ROUNDS(K3);
570  AES_ENC_4_ROUNDS(K4);
571  AES_ENC_4_ROUNDS(K5);
572  AES_ENC_4_ROUNDS(K6);
573  AES_ENC_4_ROUNDS(K7);
574  AES_ENC_4_ROUNDS(K8);
575  AES_ENC_4_ROUNDS(K9);
576  AES_ENC_4_ROUNDS(K10);
577  AES_ENC_4_ROUNDS(K11);
578  AES_ENC_4_ROUNDS(K12);
579  AES_ENC_4_ROUNDS(K13);
581 
582  _mm_storeu_si128(out_mm + 0, B0);
583  _mm_storeu_si128(out_mm + 1, B1);
584  _mm_storeu_si128(out_mm + 2, B2);
585  _mm_storeu_si128(out_mm + 3, B3);
586 
587  blocks -= 4;
588  in_mm += 4;
589  out_mm += 4;
590  }
591 
592  for(size_t i = 0; i != blocks; ++i)
593  {
594  __m128i B = _mm_loadu_si128(in_mm + i);
595 
596  B = _mm_xor_si128(B, K0);
597 
598  B = _mm_aesenc_si128(B, K1);
599  B = _mm_aesenc_si128(B, K2);
600  B = _mm_aesenc_si128(B, K3);
601  B = _mm_aesenc_si128(B, K4);
602  B = _mm_aesenc_si128(B, K5);
603  B = _mm_aesenc_si128(B, K6);
604  B = _mm_aesenc_si128(B, K7);
605  B = _mm_aesenc_si128(B, K8);
606  B = _mm_aesenc_si128(B, K9);
607  B = _mm_aesenc_si128(B, K10);
608  B = _mm_aesenc_si128(B, K11);
609  B = _mm_aesenc_si128(B, K12);
610  B = _mm_aesenc_si128(B, K13);
611  B = _mm_aesenclast_si128(B, K14);
612 
613  _mm_storeu_si128(out_mm + i, B);
614  }
615  }
616 
617 /*
618 * AES-256 Decryption
619 */
620 void AES_256_NI::decrypt_n(const byte in[], byte out[], size_t blocks) const
621  {
622  const __m128i* in_mm = (const __m128i*)in;
623  __m128i* out_mm = (__m128i*)out;
624 
625  const __m128i* key_mm = (const __m128i*)&DK[0];
626 
627  __m128i K0 = _mm_loadu_si128(key_mm);
628  __m128i K1 = _mm_loadu_si128(key_mm + 1);
629  __m128i K2 = _mm_loadu_si128(key_mm + 2);
630  __m128i K3 = _mm_loadu_si128(key_mm + 3);
631  __m128i K4 = _mm_loadu_si128(key_mm + 4);
632  __m128i K5 = _mm_loadu_si128(key_mm + 5);
633  __m128i K6 = _mm_loadu_si128(key_mm + 6);
634  __m128i K7 = _mm_loadu_si128(key_mm + 7);
635  __m128i K8 = _mm_loadu_si128(key_mm + 8);
636  __m128i K9 = _mm_loadu_si128(key_mm + 9);
637  __m128i K10 = _mm_loadu_si128(key_mm + 10);
638  __m128i K11 = _mm_loadu_si128(key_mm + 11);
639  __m128i K12 = _mm_loadu_si128(key_mm + 12);
640  __m128i K13 = _mm_loadu_si128(key_mm + 13);
641  __m128i K14 = _mm_loadu_si128(key_mm + 14);
642 
643  while(blocks >= 4)
644  {
645  __m128i B0 = _mm_loadu_si128(in_mm + 0);
646  __m128i B1 = _mm_loadu_si128(in_mm + 1);
647  __m128i B2 = _mm_loadu_si128(in_mm + 2);
648  __m128i B3 = _mm_loadu_si128(in_mm + 3);
649 
650  B0 = _mm_xor_si128(B0, K0);
651  B1 = _mm_xor_si128(B1, K0);
652  B2 = _mm_xor_si128(B2, K0);
653  B3 = _mm_xor_si128(B3, K0);
654 
655  AES_DEC_4_ROUNDS(K1);
656  AES_DEC_4_ROUNDS(K2);
657  AES_DEC_4_ROUNDS(K3);
658  AES_DEC_4_ROUNDS(K4);
659  AES_DEC_4_ROUNDS(K5);
660  AES_DEC_4_ROUNDS(K6);
661  AES_DEC_4_ROUNDS(K7);
662  AES_DEC_4_ROUNDS(K8);
663  AES_DEC_4_ROUNDS(K9);
664  AES_DEC_4_ROUNDS(K10);
665  AES_DEC_4_ROUNDS(K11);
666  AES_DEC_4_ROUNDS(K12);
667  AES_DEC_4_ROUNDS(K13);
669 
670  _mm_storeu_si128(out_mm + 0, B0);
671  _mm_storeu_si128(out_mm + 1, B1);
672  _mm_storeu_si128(out_mm + 2, B2);
673  _mm_storeu_si128(out_mm + 3, B3);
674 
675  blocks -= 4;
676  in_mm += 4;
677  out_mm += 4;
678  }
679 
680  for(size_t i = 0; i != blocks; ++i)
681  {
682  __m128i B = _mm_loadu_si128(in_mm + i);
683 
684  B = _mm_xor_si128(B, K0);
685 
686  B = _mm_aesdec_si128(B, K1);
687  B = _mm_aesdec_si128(B, K2);
688  B = _mm_aesdec_si128(B, K3);
689  B = _mm_aesdec_si128(B, K4);
690  B = _mm_aesdec_si128(B, K5);
691  B = _mm_aesdec_si128(B, K6);
692  B = _mm_aesdec_si128(B, K7);
693  B = _mm_aesdec_si128(B, K8);
694  B = _mm_aesdec_si128(B, K9);
695  B = _mm_aesdec_si128(B, K10);
696  B = _mm_aesdec_si128(B, K11);
697  B = _mm_aesdec_si128(B, K12);
698  B = _mm_aesdec_si128(B, K13);
699  B = _mm_aesdeclast_si128(B, K14);
700 
701  _mm_storeu_si128(out_mm + i, B);
702  }
703  }
704 
705 /*
706 * AES-256 Key Schedule
707 */
708 void AES_256_NI::key_schedule(const byte key[], size_t)
709  {
710  __m128i K0 = _mm_loadu_si128((const __m128i*)(key));
711  __m128i K1 = _mm_loadu_si128((const __m128i*)(key + 16));
712 
713  __m128i K2 = aes_128_key_expansion(K0, _mm_aeskeygenassist_si128(K1, 0x01));
714  __m128i K3 = aes_256_key_expansion(K1, K2);
715 
716  __m128i K4 = aes_128_key_expansion(K2, _mm_aeskeygenassist_si128(K3, 0x02));
717  __m128i K5 = aes_256_key_expansion(K3, K4);
718 
719  __m128i K6 = aes_128_key_expansion(K4, _mm_aeskeygenassist_si128(K5, 0x04));
720  __m128i K7 = aes_256_key_expansion(K5, K6);
721 
722  __m128i K8 = aes_128_key_expansion(K6, _mm_aeskeygenassist_si128(K7, 0x08));
723  __m128i K9 = aes_256_key_expansion(K7, K8);
724 
725  __m128i K10 = aes_128_key_expansion(K8, _mm_aeskeygenassist_si128(K9, 0x10));
726  __m128i K11 = aes_256_key_expansion(K9, K10);
727 
728  __m128i K12 = aes_128_key_expansion(K10, _mm_aeskeygenassist_si128(K11, 0x20));
729  __m128i K13 = aes_256_key_expansion(K11, K12);
730 
731  __m128i K14 = aes_128_key_expansion(K12, _mm_aeskeygenassist_si128(K13, 0x40));
732 
733  __m128i* EK_mm = (__m128i*)&EK[0];
734  _mm_storeu_si128(EK_mm , K0);
735  _mm_storeu_si128(EK_mm + 1, K1);
736  _mm_storeu_si128(EK_mm + 2, K2);
737  _mm_storeu_si128(EK_mm + 3, K3);
738  _mm_storeu_si128(EK_mm + 4, K4);
739  _mm_storeu_si128(EK_mm + 5, K5);
740  _mm_storeu_si128(EK_mm + 6, K6);
741  _mm_storeu_si128(EK_mm + 7, K7);
742  _mm_storeu_si128(EK_mm + 8, K8);
743  _mm_storeu_si128(EK_mm + 9, K9);
744  _mm_storeu_si128(EK_mm + 10, K10);
745  _mm_storeu_si128(EK_mm + 11, K11);
746  _mm_storeu_si128(EK_mm + 12, K12);
747  _mm_storeu_si128(EK_mm + 13, K13);
748  _mm_storeu_si128(EK_mm + 14, K14);
749 
750  // Now generate decryption keys
751 
752  __m128i* DK_mm = (__m128i*)&DK[0];
753  _mm_storeu_si128(DK_mm , K14);
754  _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(K13));
755  _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(K12));
756  _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(K11));
757  _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(K10));
758  _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(K9));
759  _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(K8));
760  _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(K7));
761  _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(K6));
762  _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(K5));
763  _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(K4));
764  _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(K3));
765  _mm_storeu_si128(DK_mm + 12, _mm_aesimc_si128(K2));
766  _mm_storeu_si128(DK_mm + 13, _mm_aesimc_si128(K1));
767  _mm_storeu_si128(DK_mm + 14, K0);
768  }
769 
770 /*
771 * Clear memory of sensitive data
772 */
774  {
775  zeroise(EK);
776  zeroise(DK);
777  }
778 
779 }
#define AES_DEC_4_LAST_ROUNDS(K)
Definition: aes_ni.cpp:94
T load_le(const byte in[], size_t off)
Definition: loadstor.h:116
void decrypt_n(const byte in[], byte out[], size_t blocks) const
Definition: aes_ni.cpp:620
void decrypt_n(const byte in[], byte out[], size_t blocks) const
Definition: aes_ni.cpp:398
unsigned char byte
Definition: types.h:22
#define AES_192_key_exp(RCON, EK_OFF)
#define AES_ENC_4_ROUNDS(K)
Definition: aes_ni.cpp:67
#define AES_DEC_4_ROUNDS(K)
Definition: aes_ni.cpp:85
#define AES_ENC_4_LAST_ROUNDS(K)
Definition: aes_ni.cpp:76
void encrypt_n(const byte in[], byte out[], size_t blocks) const
Definition: aes_ni.cpp:106
void decrypt_n(const byte in[], byte out[], size_t blocks) const
Definition: aes_ni.cpp:182
void encrypt_n(const byte in[], byte out[], size_t blocks) const
Definition: aes_ni.cpp:316
#define AES_128_key_exp(K, RCON)
void encrypt_n(const byte in[], byte out[], size_t blocks) const
Definition: aes_ni.cpp:532
void zeroise(MemoryRegion< T > &vec)
Definition: secmem.h:415
unsigned int u32bit
Definition: types.h:32