Botan  1.10.9
idea_sse2.cpp
Go to the documentation of this file.
/*
* IDEA in SSE2
* (C) 2009 Jack Lloyd
*
* Distributed under the terms of the Botan license
*/
7 
8 #include <botan/idea_sse2.h>
9 #include <emmintrin.h>
10 
11 namespace Botan {
12 
13 namespace {
14 
15 inline __m128i mul(__m128i X, u16bit K_16)
16  {
17  const __m128i zeros = _mm_set1_epi16(0);
18  const __m128i ones = _mm_set1_epi16(1);
19 
20  const __m128i K = _mm_set1_epi16(K_16);
21 
22  const __m128i X_is_zero = _mm_cmpeq_epi16(X, zeros);
23  const __m128i K_is_zero = _mm_cmpeq_epi16(K, zeros);
24 
25  const __m128i mul_lo = _mm_mullo_epi16(X, K);
26  const __m128i mul_hi = _mm_mulhi_epu16(X, K);
27 
28  __m128i T = _mm_sub_epi16(mul_lo, mul_hi);
29 
30  // Unsigned compare; cmp = 1 if mul_lo < mul_hi else 0
31  const __m128i subs = _mm_subs_epu16(mul_hi, mul_lo);
32  const __m128i cmp = _mm_min_epu8(
33  _mm_or_si128(subs, _mm_srli_epi16(subs, 8)), ones);
34 
35  T = _mm_add_epi16(T, cmp);
36 
37  /* Selection: if X[i] is zero then assign 1-K
38  if K is zero then assign 1-X[i]
39 
40  Could if() off value of K_16 for the second, but this gives a
41  constant time implementation which is a nice bonus.
42  */
43 
44  T = _mm_or_si128(
45  _mm_andnot_si128(X_is_zero, T),
46  _mm_and_si128(_mm_sub_epi16(ones, K), X_is_zero));
47 
48  T = _mm_or_si128(
49  _mm_andnot_si128(K_is_zero, T),
50  _mm_and_si128(_mm_sub_epi16(ones, X), K_is_zero));
51 
52  return T;
53  }
54 
/*
* 4x8 matrix transpose
*
* On entry B0..B3 hold eight consecutive blocks of four 16-bit words
* each (two blocks per register, in memory order). On return Bj holds
* word j of every block, with the word taken from block i in 16-bit
* lane i.
*
* FIXME: why do I need the extra set of unpack_epi32 here? Inverse in
* transpose_out doesn't need it. Something with the shuffle? Removing
* that extra unpack could easily save 3-4 cycles per block, and would
* also help a lot with register pressure on 32-bit x86
*/
void transpose_in(__m128i& B0, __m128i& B1, __m128i& B2, __m128i& B3)
   {
   // Interleave the 32-bit elements of each register pair
   __m128i T0 = _mm_unpackhi_epi32(B0, B1);
   __m128i T1 = _mm_unpacklo_epi32(B0, B1);
   __m128i T2 = _mm_unpackhi_epi32(B2, B3);
   __m128i T3 = _mm_unpacklo_epi32(B2, B3);

   // Second interleave: T4/T6 now hold words 0 and 1 of four blocks
   // each, T5/T7 hold words 2 and 3 of the same blocks
   __m128i T4 = _mm_unpacklo_epi32(T0, T1);
   __m128i T5 = _mm_unpackhi_epi32(T0, T1);
   __m128i T6 = _mm_unpacklo_epi32(T2, T3);
   __m128i T7 = _mm_unpackhi_epi32(T2, T3);

   // Reorder the 16-bit lanes inside each 64-bit half so that each
   // 32-bit element pairs the same word index from two adjacent blocks
   T0 = _mm_shufflehi_epi16(T4, _MM_SHUFFLE(1, 3, 0, 2));
   T1 = _mm_shufflehi_epi16(T5, _MM_SHUFFLE(1, 3, 0, 2));
   T2 = _mm_shufflehi_epi16(T6, _MM_SHUFFLE(1, 3, 0, 2));
   T3 = _mm_shufflehi_epi16(T7, _MM_SHUFFLE(1, 3, 0, 2));

   T0 = _mm_shufflelo_epi16(T0, _MM_SHUFFLE(1, 3, 0, 2));
   T1 = _mm_shufflelo_epi16(T1, _MM_SHUFFLE(1, 3, 0, 2));
   T2 = _mm_shufflelo_epi16(T2, _MM_SHUFFLE(1, 3, 0, 2));
   T3 = _mm_shufflelo_epi16(T3, _MM_SHUFFLE(1, 3, 0, 2));

   // Swap the two middle 32-bit elements: the low 64 bits then carry
   // one word index for four blocks and the high 64 bits the next one
   T0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 1, 2, 0));
   T1 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3, 1, 2, 0));
   T2 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(3, 1, 2, 0));
   T3 = _mm_shuffle_epi32(T3, _MM_SHUFFLE(3, 1, 2, 0));

   // Join the halves for blocks 0-3 with the halves for blocks 4-7
   B0 = _mm_unpacklo_epi64(T0, T2);
   B1 = _mm_unpackhi_epi64(T0, T2);
   B2 = _mm_unpacklo_epi64(T1, T3);
   B3 = _mm_unpackhi_epi64(T1, T3);
   }
95 
/*
* 4x8 matrix transpose (reverse)
*
* On entry Bj holds word j of eight blocks, with the word belonging to
* block i in 16-bit lane i. On return B0..B3 hold the eight blocks of
* four 16-bit words in order, two blocks per register.
*/
void transpose_out(__m128i& B0, __m128i& B1, __m128i& B2, __m128i& B3)
   {
   // T0/T1 gather the lanes for blocks 0-3 (words 0,1 and words 2,3),
   // T2/T3 the lanes for blocks 4-7
   __m128i T0 = _mm_unpacklo_epi64(B0, B1);
   __m128i T1 = _mm_unpacklo_epi64(B2, B3);
   __m128i T2 = _mm_unpackhi_epi64(B0, B1);
   __m128i T3 = _mm_unpackhi_epi64(B2, B3);

   // Swap the two middle 32-bit elements of each register
   T0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 1, 2, 0));
   T1 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3, 1, 2, 0));
   T2 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(3, 1, 2, 0));
   T3 = _mm_shuffle_epi32(T3, _MM_SHUFFLE(3, 1, 2, 0));

   // Swap the two middle 16-bit lanes in each 64-bit half, so every
   // 32-bit element holds two adjacent words of a single block
   T0 = _mm_shufflehi_epi16(T0, _MM_SHUFFLE(3, 1, 2, 0));
   T1 = _mm_shufflehi_epi16(T1, _MM_SHUFFLE(3, 1, 2, 0));
   T2 = _mm_shufflehi_epi16(T2, _MM_SHUFFLE(3, 1, 2, 0));
   T3 = _mm_shufflehi_epi16(T3, _MM_SHUFFLE(3, 1, 2, 0));

   T0 = _mm_shufflelo_epi16(T0, _MM_SHUFFLE(3, 1, 2, 0));
   T1 = _mm_shufflelo_epi16(T1, _MM_SHUFFLE(3, 1, 2, 0));
   T2 = _mm_shufflelo_epi16(T2, _MM_SHUFFLE(3, 1, 2, 0));
   T3 = _mm_shufflelo_epi16(T3, _MM_SHUFFLE(3, 1, 2, 0));

   // Interleave the (word 0,1) and (word 2,3) pairs to rebuild blocks
   B0 = _mm_unpacklo_epi32(T0, T1);
   B1 = _mm_unpackhi_epi32(T0, T1);
   B2 = _mm_unpacklo_epi32(T2, T3);
   B3 = _mm_unpackhi_epi32(T2, T3);
   }
126 
127 /*
128 * IDEA encryption/decryption in SSE2
129 */
130 void idea_op_8(const byte in[64], byte out[64], const u16bit EK[52])
131  {
132  const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
133 
134  __m128i B0 = _mm_loadu_si128(in_mm + 0);
135  __m128i B1 = _mm_loadu_si128(in_mm + 1);
136  __m128i B2 = _mm_loadu_si128(in_mm + 2);
137  __m128i B3 = _mm_loadu_si128(in_mm + 3);
138 
139  transpose_in(B0, B1, B2, B3);
140 
141  // byte swap
142  B0 = _mm_or_si128(_mm_slli_epi16(B0, 8), _mm_srli_epi16(B0, 8));
143  B1 = _mm_or_si128(_mm_slli_epi16(B1, 8), _mm_srli_epi16(B1, 8));
144  B2 = _mm_or_si128(_mm_slli_epi16(B2, 8), _mm_srli_epi16(B2, 8));
145  B3 = _mm_or_si128(_mm_slli_epi16(B3, 8), _mm_srli_epi16(B3, 8));
146 
147  for(size_t i = 0; i != 8; ++i)
148  {
149  B0 = mul(B0, EK[6*i+0]);
150  B1 = _mm_add_epi16(B1, _mm_set1_epi16(EK[6*i+1]));
151  B2 = _mm_add_epi16(B2, _mm_set1_epi16(EK[6*i+2]));
152  B3 = mul(B3, EK[6*i+3]);
153 
154  __m128i T0 = B2;
155 
156  B2 = _mm_xor_si128(B2, B0);
157  B2 = mul(B2, EK[6*i+4]);
158 
159  __m128i T1 = B1;
160 
161  B1 = _mm_xor_si128(B1, B3);
162  B1 = _mm_add_epi16(B1, B2);
163  B1 = mul(B1, EK[6*i+5]);
164 
165  B2 = _mm_add_epi16(B2, B1);
166 
167  B0 = _mm_xor_si128(B0, B1);
168  B1 = _mm_xor_si128(B1, T0);
169  B3 = _mm_xor_si128(B3, B2);
170  B2 = _mm_xor_si128(B2, T1);
171  }
172 
173  B0 = mul(B0, EK[48]);
174  B1 = _mm_add_epi16(B1, _mm_set1_epi16(EK[50]));
175  B2 = _mm_add_epi16(B2, _mm_set1_epi16(EK[49]));
176  B3 = mul(B3, EK[51]);
177 
178  // byte swap
179  B0 = _mm_or_si128(_mm_slli_epi16(B0, 8), _mm_srli_epi16(B0, 8));
180  B1 = _mm_or_si128(_mm_slli_epi16(B1, 8), _mm_srli_epi16(B1, 8));
181  B2 = _mm_or_si128(_mm_slli_epi16(B2, 8), _mm_srli_epi16(B2, 8));
182  B3 = _mm_or_si128(_mm_slli_epi16(B3, 8), _mm_srli_epi16(B3, 8));
183 
184  transpose_out(B0, B2, B1, B3);
185 
186  __m128i* out_mm = reinterpret_cast<__m128i*>(out);
187 
188  _mm_storeu_si128(out_mm + 0, B0);
189  _mm_storeu_si128(out_mm + 1, B2);
190  _mm_storeu_si128(out_mm + 2, B1);
191  _mm_storeu_si128(out_mm + 3, B3);
192  }
193 
194 }
195 
196 /*
197 * IDEA Encryption
198 */
199 void IDEA_SSE2::encrypt_n(const byte in[], byte out[], size_t blocks) const
200  {
201  const u16bit* KS = &this->get_EK()[0];
202 
203  while(blocks >= 8)
204  {
205  idea_op_8(in, out, KS);
206  in += 8 * BLOCK_SIZE;
207  out += 8 * BLOCK_SIZE;
208  blocks -= 8;
209  }
210 
211  if(blocks)
212  IDEA::encrypt_n(in, out, blocks);
213  }
214 
215 /*
216 * IDEA Decryption
217 */
218 void IDEA_SSE2::decrypt_n(const byte in[], byte out[], size_t blocks) const
219  {
220  const u16bit* KS = &this->get_DK()[0];
221 
222  while(blocks >= 8)
223  {
224  idea_op_8(in, out, KS);
225  in += 8 * BLOCK_SIZE;
226  out += 8 * BLOCK_SIZE;
227  blocks -= 8;
228  }
229 
230  if(blocks)
231  IDEA::decrypt_n(in, out, blocks);
232  }
233 
234 }
void encrypt_n(const byte in[], byte out[], size_t blocks) const
Definition: idea_sse2.cpp:199
const SecureVector< u16bit > & get_EK() const
Definition: idea.h:33
void decrypt_n(const byte in[], byte out[], size_t blocks) const
Definition: idea_sse2.cpp:218
unsigned char byte
Definition: types.h:22
unsigned short u16bit
Definition: types.h:27
void decrypt_n(const byte in[], byte out[], size_t blocks) const
Definition: idea.cpp:117
void encrypt_n(const byte in[], byte out[], size_t blocks) const
Definition: idea.cpp:109
const SecureVector< u16bit > & get_DK() const
Definition: idea.h:38