Copyright (c) 1998-2013, Brian Gladman, Worcester, UK. All rights reserved.

The redistribution and use of this software (with or without changes)
is allowed without the payment of fees or royalties provided that:

  source code distributions include the above copyright notice, this
  list of conditions and the following disclaimer;

  binary distributions include the above copyright notice, this list
  of conditions and the following disclaimer in their documentation.

This software is provided 'as is' with no explicit or implied warranties
in respect of its operation, including, but not limited to, correctness
and fitness for purpose.
---------------------------------------------------------------------------
Issue Date: 09/09/2014
22 #if defined( USE_INTEL_AES_IF_PRESENT )
27 #pragma intrinsic(__cpuid)
28 #define INLINE __inline
30 INLINE int has_aes_ni(void)
37 test = cpu_info[2] & 0x02000000;
42 #elif defined( __GNUC__ )
46 #if !defined(__clang__)
47 #pragma GCC target ("ssse3")
48 #pragma GCC target ("sse4.1")
49 #pragma GCC target ("aes")
52 #include <x86intrin.h>
53 #define INLINE static __inline
55 INLINE int has_aes_ni()
60 unsigned int a, b, c, d;
61 if(!__get_cpuid(1, &a, &b, &c, &d))
64 test = (c & 0x2000000);
70 #error AES New Instructions require Microsoft, Intel, GNU C, or CLANG
73 INLINE __m128i aes_128_assist(__m128i t1, __m128i t2)
76 t2 = _mm_shuffle_epi32(t2, 0xff);
77 t3 = _mm_slli_si128(t1, 0x4);
78 t1 = _mm_xor_si128(t1, t3);
79 t3 = _mm_slli_si128(t3, 0x4);
80 t1 = _mm_xor_si128(t1, t3);
81 t3 = _mm_slli_si128(t3, 0x4);
82 t1 = _mm_xor_si128(t1, t3);
83 t1 = _mm_xor_si128(t1, t2);
87 AES_RETURN aes_ni(encrypt_key128)(const unsigned char *key, aes_encrypt_ctx cx[1])
90 __m128i *ks = (__m128i*)cx->ks;
94 return aes_xi(encrypt_key128)(key, cx);
97 t1 = _mm_loadu_si128((__m128i*)key);
101 t2 = _mm_aeskeygenassist_si128(t1, 0x1);
102 t1 = aes_128_assist(t1, t2);
105 t2 = _mm_aeskeygenassist_si128(t1, 0x2);
106 t1 = aes_128_assist(t1, t2);
109 t2 = _mm_aeskeygenassist_si128(t1, 0x4);
110 t1 = aes_128_assist(t1, t2);
113 t2 = _mm_aeskeygenassist_si128(t1, 0x8);
114 t1 = aes_128_assist(t1, t2);
117 t2 = _mm_aeskeygenassist_si128(t1, 0x10);
118 t1 = aes_128_assist(t1, t2);
121 t2 = _mm_aeskeygenassist_si128(t1, 0x20);
122 t1 = aes_128_assist(t1, t2);
125 t2 = _mm_aeskeygenassist_si128(t1, 0x40);
126 t1 = aes_128_assist(t1, t2);
129 t2 = _mm_aeskeygenassist_si128(t1, 0x80);
130 t1 = aes_128_assist(t1, t2);
133 t2 = _mm_aeskeygenassist_si128(t1, 0x1b);
134 t1 = aes_128_assist(t1, t2);
137 t2 = _mm_aeskeygenassist_si128(t1, 0x36);
138 t1 = aes_128_assist(t1, t2);
142 cx->inf.b[0] = 10 * 16;
146 INLINE void aes_192_assist(__m128i* t1, __m128i * t2, __m128i * t3)
149 *t2 = _mm_shuffle_epi32(*t2, 0x55);
150 t4 = _mm_slli_si128(*t1, 0x4);
151 *t1 = _mm_xor_si128(*t1, t4);
152 t4 = _mm_slli_si128(t4, 0x4);
153 *t1 = _mm_xor_si128(*t1, t4);
154 t4 = _mm_slli_si128(t4, 0x4);
155 *t1 = _mm_xor_si128(*t1, t4);
156 *t1 = _mm_xor_si128(*t1, *t2);
157 *t2 = _mm_shuffle_epi32(*t1, 0xff);
158 t4 = _mm_slli_si128(*t3, 0x4);
159 *t3 = _mm_xor_si128(*t3, t4);
160 *t3 = _mm_xor_si128(*t3, *t2);
163 AES_RETURN aes_ni(encrypt_key192)(const unsigned char *key, aes_encrypt_ctx cx[1])
166 __m128i *ks = (__m128i*)cx->ks;
170 return aes_xi(encrypt_key192)(key, cx);
173 t1 = _mm_loadu_si128((__m128i*)key);
174 t3 = _mm_loadu_si128((__m128i*)(key + 16));
179 t2 = _mm_aeskeygenassist_si128(t3, 0x1);
180 aes_192_assist(&t1, &t2, &t3);
182 ks[1] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ks[1]), _mm_castsi128_pd(t1), 0));
183 ks[2] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(t1), _mm_castsi128_pd(t3), 1));
185 t2 = _mm_aeskeygenassist_si128(t3, 0x2);
186 aes_192_assist(&t1, &t2, &t3);
190 t2 = _mm_aeskeygenassist_si128(t3, 0x4);
191 aes_192_assist(&t1, &t2, &t3);
192 ks[4] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ks[4]), _mm_castsi128_pd(t1), 0));
193 ks[5] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(t1), _mm_castsi128_pd(t3), 1));
195 t2 = _mm_aeskeygenassist_si128(t3, 0x8);
196 aes_192_assist(&t1, &t2, &t3);
200 t2 = _mm_aeskeygenassist_si128(t3, 0x10);
201 aes_192_assist(&t1, &t2, &t3);
202 ks[7] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ks[7]), _mm_castsi128_pd(t1), 0));
203 ks[8] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(t1), _mm_castsi128_pd(t3), 1));
205 t2 = _mm_aeskeygenassist_si128(t3, 0x20);
206 aes_192_assist(&t1, &t2, &t3);
210 t2 = _mm_aeskeygenassist_si128(t3, 0x40);
211 aes_192_assist(&t1, &t2, &t3);
212 ks[10] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ks[10]), _mm_castsi128_pd(t1), 0));
213 ks[11] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(t1), _mm_castsi128_pd(t3), 1));
215 t2 = _mm_aeskeygenassist_si128(t3, 0x80);
216 aes_192_assist(&t1, &t2, &t3);
220 cx->inf.b[0] = 12 * 16;
224 INLINE void aes_256_assist1(__m128i* t1, __m128i * t2)
227 *t2 = _mm_shuffle_epi32(*t2, 0xff);
228 t4 = _mm_slli_si128(*t1, 0x4);
229 *t1 = _mm_xor_si128(*t1, t4);
230 t4 = _mm_slli_si128(t4, 0x4);
231 *t1 = _mm_xor_si128(*t1, t4);
232 t4 = _mm_slli_si128(t4, 0x4);
233 *t1 = _mm_xor_si128(*t1, t4);
234 *t1 = _mm_xor_si128(*t1, *t2);
237 INLINE void aes_256_assist2(__m128i* t1, __m128i * t3)
240 t4 = _mm_aeskeygenassist_si128(*t1, 0x0);
241 t2 = _mm_shuffle_epi32(t4, 0xaa);
242 t4 = _mm_slli_si128(*t3, 0x4);
243 *t3 = _mm_xor_si128(*t3, t4);
244 t4 = _mm_slli_si128(t4, 0x4);
245 *t3 = _mm_xor_si128(*t3, t4);
246 t4 = _mm_slli_si128(t4, 0x4);
247 *t3 = _mm_xor_si128(*t3, t4);
248 *t3 = _mm_xor_si128(*t3, t2);
251 AES_RETURN aes_ni(encrypt_key256)(const unsigned char *key, aes_encrypt_ctx cx[1])
254 __m128i *ks = (__m128i*)cx->ks;
258 return aes_xi(encrypt_key256)(key, cx);
261 t1 = _mm_loadu_si128((__m128i*)key);
262 t3 = _mm_loadu_si128((__m128i*)(key + 16));
267 t2 = _mm_aeskeygenassist_si128(t3, 0x01);
268 aes_256_assist1(&t1, &t2);
270 aes_256_assist2(&t1, &t3);
273 t2 = _mm_aeskeygenassist_si128(t3, 0x02);
274 aes_256_assist1(&t1, &t2);
276 aes_256_assist2(&t1, &t3);
279 t2 = _mm_aeskeygenassist_si128(t3, 0x04);
280 aes_256_assist1(&t1, &t2);
282 aes_256_assist2(&t1, &t3);
285 t2 = _mm_aeskeygenassist_si128(t3, 0x08);
286 aes_256_assist1(&t1, &t2);
288 aes_256_assist2(&t1, &t3);
291 t2 = _mm_aeskeygenassist_si128(t3, 0x10);
292 aes_256_assist1(&t1, &t2);
294 aes_256_assist2(&t1, &t3);
297 t2 = _mm_aeskeygenassist_si128(t3, 0x20);
298 aes_256_assist1(&t1, &t2);
300 aes_256_assist2(&t1, &t3);
303 t2 = _mm_aeskeygenassist_si128(t3, 0x40);
304 aes_256_assist1(&t1, &t2);
308 cx->inf.b[0] = 14 * 16;
312 INLINE void enc_to_dec(aes_decrypt_ctx cx[1])
314 __m128i *ks = (__m128i*)cx->ks;
317 for( j = 1 ; j < (cx->inf.b[0] >> 4) ; ++j )
318 ks[j] = _mm_aesimc_si128(ks[j]);
321 AES_RETURN aes_ni(decrypt_key128)(const unsigned char *key, aes_decrypt_ctx cx[1])
325 return aes_xi(decrypt_key128)(key, cx);
328 if(aes_ni(encrypt_key128)(key, (aes_encrypt_ctx*)cx) == EXIT_SUCCESS)
338 AES_RETURN aes_ni(decrypt_key192)(const unsigned char *key, aes_decrypt_ctx cx[1])
342 return aes_xi(decrypt_key192)(key, cx);
345 if(aes_ni(encrypt_key192)(key, (aes_encrypt_ctx*)cx) == EXIT_SUCCESS)
354 AES_RETURN aes_ni(decrypt_key256)(const unsigned char *key, aes_decrypt_ctx cx[1])
358 return aes_xi(decrypt_key256)(key, cx);
361 if(aes_ni(encrypt_key256)(key, (aes_encrypt_ctx*)cx) == EXIT_SUCCESS)
370 AES_RETURN aes_ni(encrypt)(const unsigned char *in, unsigned char *out, const aes_encrypt_ctx cx[1])
372 __m128i *key = (__m128i*)cx->ks, t;
374 if(cx->inf.b[0] != 10 * 16 && cx->inf.b[0] != 12 * 16 && cx->inf.b[0] != 14 * 16)
379 return aes_xi(encrypt)(in, out, cx);
382 t = _mm_xor_si128(_mm_loadu_si128((__m128i*)in), *(__m128i*)key);
387 t = _mm_aesenc_si128(t, *(__m128i*)++key);
388 t = _mm_aesenc_si128(t, *(__m128i*)++key);
390 t = _mm_aesenc_si128(t, *(__m128i*)++key);
391 t = _mm_aesenc_si128(t, *(__m128i*)++key);
393 t = _mm_aesenc_si128(t, *(__m128i*)++key);
394 t = _mm_aesenc_si128(t, *(__m128i*)++key);
395 t = _mm_aesenc_si128(t, *(__m128i*)++key);
396 t = _mm_aesenc_si128(t, *(__m128i*)++key);
397 t = _mm_aesenc_si128(t, *(__m128i*)++key);
398 t = _mm_aesenc_si128(t, *(__m128i*)++key);
399 t = _mm_aesenc_si128(t, *(__m128i*)++key);
400 t = _mm_aesenc_si128(t, *(__m128i*)++key);
401 t = _mm_aesenc_si128(t, *(__m128i*)++key);
402 t = _mm_aesenclast_si128(t, *(__m128i*)++key);
405 _mm_storeu_si128(&((__m128i*)out)[0], t);
409 AES_RETURN aes_ni(decrypt)(const unsigned char *in, unsigned char *out, const aes_decrypt_ctx cx[1])
411 __m128i *key = (__m128i*)cx->ks + (cx->inf.b[0] >> 4), t;
413 if(cx->inf.b[0] != 10 * 16 && cx->inf.b[0] != 12 * 16 && cx->inf.b[0] != 14 * 16)
418 return aes_xi(decrypt)(in, out, cx);
421 t = _mm_xor_si128(_mm_loadu_si128((__m128i*)in), *(__m128i*)key);
426 t = _mm_aesdec_si128(t, *(__m128i*)--key);
427 t = _mm_aesdec_si128(t, *(__m128i*)--key);
429 t = _mm_aesdec_si128(t, *(__m128i*)--key);
430 t = _mm_aesdec_si128(t, *(__m128i*)--key);
432 t = _mm_aesdec_si128(t, *(__m128i*)--key);
433 t = _mm_aesdec_si128(t, *(__m128i*)--key);
434 t = _mm_aesdec_si128(t, *(__m128i*)--key);
435 t = _mm_aesdec_si128(t, *(__m128i*)--key);
436 t = _mm_aesdec_si128(t, *(__m128i*)--key);
437 t = _mm_aesdec_si128(t, *(__m128i*)--key);
438 t = _mm_aesdec_si128(t, *(__m128i*)--key);
439 t = _mm_aesdec_si128(t, *(__m128i*)--key);
440 t = _mm_aesdec_si128(t, *(__m128i*)--key);
441 t = _mm_aesdeclast_si128(t, *(__m128i*)--key);
444 _mm_storeu_si128((__m128i*)out, t);
448 #ifdef ADD_AESNI_MODE_CALLS
449 #ifdef USE_AES_CONTEXT
451 AES_RETURN aes_CBC_encrypt(const unsigned char *in,
453 unsigned char ivec[16],
454 unsigned long length,
455 const aes_encrypt_ctx cx[1])
457 __m128i feedback, data, *key = (__m128i*)cx->ks;
458 int number_of_rounds = cx->inf.b[0] >> 4, j;
461 if(number_of_rounds != 10 && number_of_rounds != 12 && number_of_rounds != 14)
466 return aes_cbc_encrypt(in, out, length, ivec, cx);
470 length = length / 16 + 1;
472 feedback = _mm_loadu_si128((__m128i*)ivec);
473 for(i = 0; i < length; i++)
475 data = _mm_loadu_si128(&((__m128i*)in)[i]);
476 feedback = _mm_xor_si128(data, feedback);
477 feedback = _mm_xor_si128(feedback, ((__m128i*)key)[0]);
478 for(j = 1; j <number_of_rounds; j++)
479 feedback = _mm_aesenc_si128(feedback, ((__m128i*)key)[j]);
480 feedback = _mm_aesenclast_si128(feedback, ((__m128i*)key)[j]);
481 _mm_storeu_si128(&((__m128i*)out)[i], feedback);
486 AES_RETURN aes_CBC_decrypt(const unsigned char *in,
488 unsigned char ivec[16],
489 unsigned long length,
490 const aes_decrypt_ctx cx[1])
492 __m128i data, feedback, last_in, *key = (__m128i*)cx->ks;
493 int number_of_rounds = cx->inf.b[0] >> 4, j;
496 if(number_of_rounds != 10 && number_of_rounds != 12 && number_of_rounds != 14)
501 return aes_cbc_decrypt(in, out, length, ivec, cx);
505 length = length / 16 + 1;
507 feedback = _mm_loadu_si128((__m128i*)ivec);
508 for(i = 0; i < length; i++)
510 last_in = _mm_loadu_si128(&((__m128i*)in)[i]);
511 data = _mm_xor_si128(last_in, ((__m128i*)key)[number_of_rounds]);
512 for(j = number_of_rounds - 1; j > 0; j--)
514 data = _mm_aesdec_si128(data, ((__m128i*)key)[j]);
516 data = _mm_aesdeclast_si128(data, ((__m128i*)key)[0]);
517 data = _mm_xor_si128(data, feedback);
518 _mm_storeu_si128(&((__m128i*)out)[i], data);
/* Counter-increment callback for the aes_ctr_crypt fallback: bump the
   32-bit word at bytes 8..11 of the counter block (native byte order)
   and carry into the word at bytes 12..15 on wrap.
   NOTE(review): layout assumes IV in bytes 0..7 and nonce/counter in
   bytes 8..15, matching how AES_CTR_encrypt builds ctr_blk — confirm. */
static void ctr_inc(unsigned char *ctr_blk)
{
    uint32_t c;

    c = *(uint32_t*)(ctr_blk + 8);
    c++;
    *(uint32_t*)(ctr_blk + 8) = c;

    if(!c)      /* wrapped to zero: propagate the carry */
        *(uint32_t*)(ctr_blk + 12) = *(uint32_t*)(ctr_blk + 12) + 1;
}
536 AES_RETURN AES_CTR_encrypt(const unsigned char *in,
538 const unsigned char ivec[8],
539 const unsigned char nonce[4],
540 unsigned long length,
541 const aes_encrypt_ctx cx[1])
543 __m128i ctr_block = { 0 }, *key = (__m128i*)cx->ks, tmp, ONE, BSWAP_EPI64;
544 int number_of_rounds = cx->inf.b[0] >> 4, j;
547 if(number_of_rounds != 10 && number_of_rounds != 12 && number_of_rounds != 14)
552 unsigned char ctr_blk[16];
553 *(uint64_t*)ctr_blk = *(uint64_t*)ivec;
554 *(uint32_t*)(ctr_blk + 8) = *(uint32_t*)nonce;
555 return aes_ctr_crypt(in, out, length, (unsigned char*)ctr_blk, ctr_inc, cx);
559 length = length / 16 + 1;
561 ONE = _mm_set_epi32(0, 1, 0, 0);
562 BSWAP_EPI64 = _mm_setr_epi8(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
564 ctr_block = _mm_insert_epi64(ctr_block, *(long long*)ivec, 1);
566 ctr_block = _mm_set_epi64(*(__m64*)ivec, *(__m64*)&ctr_block);
568 ctr_block = _mm_insert_epi32(ctr_block, *(long*)nonce, 1);
569 ctr_block = _mm_srli_si128(ctr_block, 4);
570 ctr_block = _mm_shuffle_epi8(ctr_block, BSWAP_EPI64);
571 ctr_block = _mm_add_epi64(ctr_block, ONE);
572 for(i = 0; i < length; i++)
574 tmp = _mm_shuffle_epi8(ctr_block, BSWAP_EPI64);
575 ctr_block = _mm_add_epi64(ctr_block, ONE);
576 tmp = _mm_xor_si128(tmp, ((__m128i*)key)[0]);
577 for(j = 1; j <number_of_rounds; j++)
579 tmp = _mm_aesenc_si128(tmp, ((__m128i*)key)[j]);
581 tmp = _mm_aesenclast_si128(tmp, ((__m128i*)key)[j]);
582 tmp = _mm_xor_si128(tmp, _mm_loadu_si128(&((__m128i*)in)[i]));
583 _mm_storeu_si128(&((__m128i*)out)[i], tmp);
/* CBC-mode encryption with a raw, expanded key schedule (no context).
   key points at number_of_rounds + 1 consecutive 16-byte round keys.
   length is in bytes and is rounded up to whole blocks. */
void aes_CBC_encrypt(const unsigned char *in,
    unsigned char *out,
    unsigned char ivec[16],
    unsigned long length,
    const unsigned char *key,
    int number_of_rounds)
{
    __m128i feedback, data;
    unsigned long i;
    int j;

    if(length % 16)
        length = length / 16 + 1;   /* round up to whole blocks */
    else
        length /= 16;

    feedback = _mm_loadu_si128((__m128i*)ivec);
    for(i = 0; i < length; i++)
    {
        data = _mm_loadu_si128(&((__m128i*)in)[i]);
        feedback = _mm_xor_si128(data, feedback);
        feedback = _mm_xor_si128(feedback, ((__m128i*)key)[0]);
        for(j = 1; j < number_of_rounds; j++)
            feedback = _mm_aesenc_si128(feedback, ((__m128i*)key)[j]);
        feedback = _mm_aesenclast_si128(feedback, ((__m128i*)key)[j]);
        _mm_storeu_si128(&((__m128i*)out)[i], feedback);
    }
}
/* CBC-mode decryption with a raw, expanded DECRYPTION key schedule
   (no context).  Unlike the context version, this schedule is stored
   in application order, so the key is indexed forwards from [0]. */
void aes_CBC_decrypt(const unsigned char *in,
    unsigned char *out,
    unsigned char ivec[16],
    unsigned long length,
    const unsigned char *key,
    int number_of_rounds)
{
    __m128i data, feedback, last_in;
    unsigned long i;
    int j;

    if(length % 16)
        length = length / 16 + 1;   /* round up to whole blocks */
    else
        length /= 16;

    feedback = _mm_loadu_si128((__m128i*)ivec);
    for(i = 0; i < length; i++)
    {
        last_in = _mm_loadu_si128(&((__m128i*)in)[i]);
        data = _mm_xor_si128(last_in, ((__m128i*)key)[0]);
        for(j = 1; j < number_of_rounds; j++)
        {
            data = _mm_aesdec_si128(data, ((__m128i*)key)[j]);
        }
        data = _mm_aesdeclast_si128(data, ((__m128i*)key)[j]);
        data = _mm_xor_si128(data, feedback);
        _mm_storeu_si128(&((__m128i*)out)[i], data);
        feedback = last_in;     /* CBC chains on the ciphertext block */
    }
}
/* CTR-mode encryption with a raw, expanded key schedule (no context).
   Counter-block construction matches the context version above.
   length is in bytes and is rounded up to whole blocks. */
void AES_CTR_encrypt(const unsigned char *in,
    unsigned char *out,
    const unsigned char ivec[8],
    const unsigned char nonce[4],
    unsigned long length,
    const unsigned char *key,
    int number_of_rounds)
{
    __m128i ctr_block = { 0 }, tmp, ONE, BSWAP_EPI64;
    unsigned long i;
    int j;

    if(length % 16)
        length = length / 16 + 1;   /* round up to whole blocks */
    else
        length /= 16;

    ONE = _mm_set_epi32(0, 1, 0, 0);
    BSWAP_EPI64 = _mm_setr_epi8(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
/* _mm_insert_epi64 exists only on 64-bit targets */
#if defined( _WIN64 ) || defined( __x86_64__ )
    ctr_block = _mm_insert_epi64(ctr_block, *(long long*)ivec, 1);
#else
    ctr_block = _mm_set_epi64(*(__m64*)ivec, *(__m64*)&ctr_block);
#endif
    ctr_block = _mm_insert_epi32(ctr_block, *(long*)nonce, 1);
    ctr_block = _mm_srli_si128(ctr_block, 4);
    ctr_block = _mm_shuffle_epi8(ctr_block, BSWAP_EPI64);
    ctr_block = _mm_add_epi64(ctr_block, ONE);
    for(i = 0; i < length; i++)
    {
        tmp = _mm_shuffle_epi8(ctr_block, BSWAP_EPI64);
        ctr_block = _mm_add_epi64(ctr_block, ONE);
        tmp = _mm_xor_si128(tmp, ((__m128i*)key)[0]);
        for(j = 1; j < number_of_rounds; j++)
        {
            tmp = _mm_aesenc_si128(tmp, ((__m128i*)key)[j]);
        }
        tmp = _mm_aesenclast_si128(tmp, ((__m128i*)key)[j]);
        tmp = _mm_xor_si128(tmp, _mm_loadu_si128(&((__m128i*)in)[i]));
        _mm_storeu_si128(&((__m128i*)out)[i], tmp);
    }
}