8 #include <botan/idea_sse2.h>
/*
* Multiply each of the eight 16-bit lanes of X by the scalar K_16 using
* the low-minus-high reduction for multiplication modulo 2^16 + 1 (the
* IDEA multiply). Zero operands are handled with lane masks instead of
* branches.
*
* X:    eight unsigned 16-bit values, one per lane
* K_16: 16-bit multiplier, broadcast to every lane
*
* NOTE(review): in this excerpt the leading number on each line looks like
* extraction residue, and the braces, the opening lines of the two select
* statements and the return statement are not visible - confirm against
* the full file.
*/
15 inline __m128i mul(__m128i X,
u16bit K_16)
// Lane-wide constants 0 and 1
17 const __m128i zeros = _mm_set1_epi16(0);
18 const __m128i ones = _mm_set1_epi16(1);
// Broadcast the scalar multiplier to all eight lanes
20 const __m128i K = _mm_set1_epi16(K_16);
// All-ones masks for the lanes where an operand is zero; used by the
// select operands further down
22 const __m128i X_is_zero = _mm_cmpeq_epi16(X, zeros);
23 const __m128i K_is_zero = _mm_cmpeq_epi16(K, zeros);
// Low and high 16 bits of the unsigned 32-bit product X*K
25 const __m128i mul_lo = _mm_mullo_epi16(X, K);
26 const __m128i mul_hi = _mm_mulhi_epu16(X, K);
// X*K = hi*2^16 + lo, which is congruent to lo - hi modulo 2^16 + 1;
// the subtraction here wraps modulo 2^16
28 __m128i T = _mm_sub_epi16(mul_lo, mul_hi);
// Unsigned compare built from SSE2 primitives: the saturating subtract is
// nonzero exactly where hi > lo; OR-ing its two bytes together and taking
// the byte-wise minimum with `ones` leaves cmp = 1 where lo < hi, else 0
31 const __m128i subs = _mm_subs_epu16(mul_hi, mul_lo);
32 const __m128i cmp = _mm_min_epu8(
33 _mm_or_si128(subs, _mm_srli_epi16(subs, 8)), ones);
// Add 1 in the lanes where lo - hi wrapped, completing the reduction
35 T = _mm_add_epi16(T, cmp);
// NOTE(review): the statement these operands belong to opens on a line not
// visible here. The operands keep T where X is nonzero and supply 1 - K
// where X is zero (presumably OR-ed together and stored back into T).
// This matches IDEA's convention that a zero word stands for 2^16, since
// 2^16 * K is congruent to 1 - K modulo 2^16 + 1.
45 _mm_andnot_si128(X_is_zero, T),
46 _mm_and_si128(_mm_sub_epi16(ones, K), X_is_zero));
// Same selection for the multiplier: keep T where K is nonzero, supply
// 1 - X where K is zero (opening line likewise not visible)
49 _mm_andnot_si128(K_is_zero, T),
50 _mm_and_si128(_mm_sub_epi16(ones, X), K_is_zero));
/*
* Transpose 8 rows of four 16-bit words into 4 registers of eight words.
*
* On entry B0..B3 together hold eight consecutive 4-word rows (two rows
* per register, rows 0-1 in B0 up to rows 6-7 in B3). On exit Bj holds
* word j of rows 0..7, in row order. All four arguments are overwritten.
*
* NOTE(review): the leading number on each line and the missing braces
* look like extraction residue - confirm against the full file.
*/
63 void transpose_in(__m128i& B0, __m128i& B1, __m128i& B2, __m128i& B3)
// First 32-bit interleave: pair up rows from B0/B1 and from B2/B3
65 __m128i T0 = _mm_unpackhi_epi32(B0, B1);
66 __m128i T1 = _mm_unpacklo_epi32(B0, B1);
67 __m128i T2 = _mm_unpackhi_epi32(B2, B3);
68 __m128i T3 = _mm_unpacklo_epi32(B2, B3);
// Second 32-bit interleave: T4/T6 now hold words 0-1 of four rows each,
// T5/T7 hold words 2-3 of the same rows
70 __m128i T4 = _mm_unpacklo_epi32(T0, T1);
71 __m128i T5 = _mm_unpackhi_epi32(T0, T1);
72 __m128i T6 = _mm_unpacklo_epi32(T2, T3);
73 __m128i T7 = _mm_unpackhi_epi32(T2, T3);
// Reorder the 16-bit words inside the upper 64 bits of each register
// (destination words 0..3 take source words 2, 0, 3, 1)
75 T0 = _mm_shufflehi_epi16(T4, _MM_SHUFFLE(1, 3, 0, 2));
76 T1 = _mm_shufflehi_epi16(T5, _MM_SHUFFLE(1, 3, 0, 2));
77 T2 = _mm_shufflehi_epi16(T6, _MM_SHUFFLE(1, 3, 0, 2));
78 T3 = _mm_shufflehi_epi16(T7, _MM_SHUFFLE(1, 3, 0, 2));
// Same word reordering inside the lower 64 bits
80 T0 = _mm_shufflelo_epi16(T0, _MM_SHUFFLE(1, 3, 0, 2));
81 T1 = _mm_shufflelo_epi16(T1, _MM_SHUFFLE(1, 3, 0, 2));
82 T2 = _mm_shufflelo_epi16(T2, _MM_SHUFFLE(1, 3, 0, 2));
83 T3 = _mm_shufflelo_epi16(T3, _MM_SHUFFLE(1, 3, 0, 2));
// Swap the two middle 32-bit elements so each 64-bit half holds a single
// word index for four rows
85 T0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 1, 2, 0));
86 T1 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3, 1, 2, 0));
87 T2 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(3, 1, 2, 0));
88 T3 = _mm_shuffle_epi32(T3, _MM_SHUFFLE(3, 1, 2, 0));
// Join the 64-bit halves: rows 0-3 come from T0/T1, rows 4-7 from T2/T3
90 B0 = _mm_unpacklo_epi64(T0, T2);
91 B1 = _mm_unpackhi_epi64(T0, T2);
92 B2 = _mm_unpacklo_epi64(T1, T3);
93 B3 = _mm_unpackhi_epi64(T1, T3);
/*
* Inverse layout change of transpose_in: 4 registers of eight 16-bit
* words back into 8 rows of four words.
*
* On entry each argument holds one word position for rows 0..7. On exit
* B0 holds rows 0-1, B1 rows 2-3, B2 rows 4-5 and B3 rows 6-7, where each
* row is (word from 1st argument, 2nd, 3rd, 4th). All four arguments are
* overwritten.
*
* NOTE(review): the leading number on each line and the missing braces
* look like extraction residue - confirm against the full file.
*/
99 void transpose_out(__m128i& B0, __m128i& B1, __m128i& B2, __m128i& B3)
// Split by row group: T0/T1 carry rows 0-3, T2/T3 carry rows 4-7
101 __m128i T0 = _mm_unpacklo_epi64(B0, B1);
102 __m128i T1 = _mm_unpacklo_epi64(B2, B3);
103 __m128i T2 = _mm_unpackhi_epi64(B0, B1);
104 __m128i T3 = _mm_unpackhi_epi64(B2, B3);
// Swap the two middle 32-bit elements of each register
106 T0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 1, 2, 0));
107 T1 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3, 1, 2, 0));
108 T2 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(3, 1, 2, 0));
109 T3 = _mm_shuffle_epi32(T3, _MM_SHUFFLE(3, 1, 2, 0));
// Swap the two middle 16-bit words of the upper 64 bits
111 T0 = _mm_shufflehi_epi16(T0, _MM_SHUFFLE(3, 1, 2, 0));
112 T1 = _mm_shufflehi_epi16(T1, _MM_SHUFFLE(3, 1, 2, 0));
113 T2 = _mm_shufflehi_epi16(T2, _MM_SHUFFLE(3, 1, 2, 0));
114 T3 = _mm_shufflehi_epi16(T3, _MM_SHUFFLE(3, 1, 2, 0));
// Same swap in the lower 64 bits; each register now holds interleaved
// word pairs (1st/2nd argument in T0 and T2, 3rd/4th in T1 and T3)
116 T0 = _mm_shufflelo_epi16(T0, _MM_SHUFFLE(3, 1, 2, 0));
117 T1 = _mm_shufflelo_epi16(T1, _MM_SHUFFLE(3, 1, 2, 0));
118 T2 = _mm_shufflelo_epi16(T2, _MM_SHUFFLE(3, 1, 2, 0));
119 T3 = _mm_shufflelo_epi16(T3, _MM_SHUFFLE(3, 1, 2, 0));
// Interleave the 32-bit pairs to rebuild complete 4-word rows
121 B0 = _mm_unpacklo_epi32(T0, T1);
122 B1 = _mm_unpackhi_epi32(T0, T1);
123 B2 = _mm_unpacklo_epi32(T2, T3);
124 B3 = _mm_unpackhi_epi32(T2, T3);
/*
* Run the IDEA round structure over 64 bytes (eight 8-byte blocks) at
* once, with one 16-bit word position per SSE2 register.
*
* in:  64 input bytes, read with unaligned loads
* out: 64 output bytes, written with unaligned stores
* EK:  52 16-bit subkeys; six per round for eight rounds (EK[0..47]) plus
*      four for the output transform (EK[48..51])
*
* NOTE(review): the leading number on each line and the missing braces
* look like extraction residue, and the declarations of T0 and T1 used in
* the round loop are not visible in this excerpt - confirm against the
* full file.
*/
130 void idea_op_8(
const byte in[64],
byte out[64],
const u16bit EK[52])
132 const __m128i* in_mm =
reinterpret_cast<const __m128i*
>(in);
// Unaligned loads: no alignment is required of `in`
134 __m128i B0 = _mm_loadu_si128(in_mm + 0);
135 __m128i B1 = _mm_loadu_si128(in_mm + 1);
136 __m128i B2 = _mm_loadu_si128(in_mm + 2);
137 __m128i B3 = _mm_loadu_si128(in_mm + 3);
// Regroup so that Bj holds word j of all eight blocks
139 transpose_in(B0, B1, B2, B3);
// Swap the two bytes of every 16-bit lane (presumably big-endian cipher
// words to host order - confirm)
142 B0 = _mm_or_si128(_mm_slli_epi16(B0, 8), _mm_srli_epi16(B0, 8));
143 B1 = _mm_or_si128(_mm_slli_epi16(B1, 8), _mm_srli_epi16(B1, 8));
144 B2 = _mm_or_si128(_mm_slli_epi16(B2, 8), _mm_srli_epi16(B2, 8));
145 B3 = _mm_or_si128(_mm_slli_epi16(B3, 8), _mm_srli_epi16(B3, 8));
// Eight rounds, each consuming subkeys EK[6*i] .. EK[6*i+5]
147 for(
size_t i = 0; i != 8; ++i)
// Key mixing: multiply the outer words, add (mod 2^16) to the inner words
149 B0 = mul(B0, EK[6*i+0]);
150 B1 = _mm_add_epi16(B1, _mm_set1_epi16(EK[6*i+1]));
151 B2 = _mm_add_epi16(B2, _mm_set1_epi16(EK[6*i+2]));
152 B3 = mul(B3, EK[6*i+3]);
// Multiply/add mixing of B0^B2 and B1^B3 with the fifth and sixth subkeys
156 B2 = _mm_xor_si128(B2, B0);
157 B2 = mul(B2, EK[6*i+4]);
161 B1 = _mm_xor_si128(B1, B3);
162 B1 = _mm_add_epi16(B1, B2);
163 B1 = mul(B1, EK[6*i+5]);
165 B2 = _mm_add_epi16(B2, B1);
// Fold the mixing results back into the four words.
// NOTE(review): T0 and T1 are not declared in the visible lines;
// presumably they are copies of B2 and B1 saved earlier in this round -
// verify against the full file.
167 B0 = _mm_xor_si128(B0, B1);
168 B1 = _mm_xor_si128(B1, T0);
169 B3 = _mm_xor_si128(B3, B2);
170 B2 = _mm_xor_si128(B2, T1);
// Output transform with the last four subkeys; note that B1 takes EK[50]
// and B2 takes EK[49]
173 B0 = mul(B0, EK[48]);
174 B1 = _mm_add_epi16(B1, _mm_set1_epi16(EK[50]));
175 B2 = _mm_add_epi16(B2, _mm_set1_epi16(EK[49]));
176 B3 = mul(B3, EK[51]);
// Swap the bytes of every 16-bit lane back before storing
179 B0 = _mm_or_si128(_mm_slli_epi16(B0, 8), _mm_srli_epi16(B0, 8));
180 B1 = _mm_or_si128(_mm_slli_epi16(B1, 8), _mm_srli_epi16(B1, 8));
181 B2 = _mm_or_si128(_mm_slli_epi16(B2, 8), _mm_srli_epi16(B2, 8));
182 B3 = _mm_or_si128(_mm_slli_epi16(B3, 8), _mm_srli_epi16(B3, 8));
// Back to block layout. B2 and B1 are passed in swapped positions, so each
// output block carries its words in the order B0, B2, B1, B3.
184 transpose_out(B0, B2, B1, B3);
186 __m128i* out_mm =
reinterpret_cast<__m128i*
>(out);
// transpose_out wrote blocks 0-1 to B0, 2-3 to B2, 4-5 to B1 and 6-7 to
// B3 (following the argument order above), hence the store order here
188 _mm_storeu_si128(out_mm + 0, B0);
189 _mm_storeu_si128(out_mm + 1, B2);
190 _mm_storeu_si128(out_mm + 2, B1);
191 _mm_storeu_si128(out_mm + 3, B3);
205 idea_op_8(in, out, KS);
224 idea_op_8(in, out, KS);
void encrypt_n(const byte in[], byte out[], size_t blocks) const
const SecureVector< u16bit > & get_EK() const
void decrypt_n(const byte in[], byte out[], size_t blocks) const
void decrypt_n(const byte in[], byte out[], size_t blocks) const
void encrypt_n(const byte in[], byte out[], size_t blocks) const
const SecureVector< u16bit > & get_DK() const