--- /dev/null
+/*
+Copyright (c) 1998-2013, Brian Gladman, Worcester, UK. All rights reserved.
+
+The redistribution and use of this software (with or without changes)
+is allowed without the payment of fees or royalties provided that:
+
+ source code distributions include the above copyright notice, this
+ list of conditions and the following disclaimer;
+
+ binary distributions include the above copyright notice, this list
+ of conditions and the following disclaimer in their documentation.
+
+This software is provided 'as is' with no explicit or implied warranties
+in respect of its operation, including, but not limited to, correctness
+and fitness for purpose.
+---------------------------------------------------------------------------
+Issue Date: 09/09/2014
+*/
+
+#include "aes_ni.h"
+
+#include <string.h>
+
+#if defined( USE_INTEL_AES_IF_PRESENT )
+
+#if defined(_MSC_VER)
+
+#include <intrin.h>
+#pragma intrinsic(__cpuid)
+#define INLINE __inline
+
+/* Report whether the processor advertises the AES instructions.
+   CPUID leaf 1 is queried on the first call only; the masked value is kept
+   in a function-local static and handed back on every later call.
+   Returns 0 when the feature bit is clear and 0x02000000 (not 1) when set. */
+INLINE int has_aes_ni(void)
+{
+    static int cached = -1;     /* negative means "not probed yet" */
+    int regs[4];
+
+    if(cached >= 0)
+        return cached;
+
+    __cpuid(regs, 1);
+    cached = regs[2] & 0x02000000;
+    return cached;
+}
+
+#elif defined( __GNUC__ )
+
+#include <cpuid.h>
+
+#if !defined(__clang__)
+#pragma GCC target ("ssse3")
+#pragma GCC target ("sse4.1")
+#pragma GCC target ("aes")
+#endif
+
+#include <x86intrin.h>
+#define INLINE static __inline
+
+/* Report whether the processor advertises the AES instructions.
+   The CPUID query is made on the first call only and the outcome is cached
+   in a function-local static.  Returns 0 when CPUID leaf 1 is unavailable or
+   the feature bit is clear, and the non-zero masked value 0x2000000 when the
+   bit is set.
+   The parameter list is spelled (void) so this is a real prototype and
+   matches the definition used for the Microsoft compiler. */
+INLINE int has_aes_ni(void)
+{
+    static int test = -1;       /* negative means "not probed yet" */
+
+    if(test < 0)
+    {
+        unsigned int a, b, c, d;
+
+        if(__get_cpuid(1, &a, &b, &c, &d))
+            test = (int)(c & 0x2000000);
+        else
+            test = 0;           /* leaf 1 not supported */
+    }
+    return test;
+}
+
+#else
+#error AES New Instructions require Microsoft, Intel, GNU C, or CLANG
+#endif
+
+/* One step of the 128-bit key expansion.
+   t1 is the previous round key and t2 the matching
+   _mm_aeskeygenassist_si128() result.  The top dword of t2 is broadcast to
+   all four lanes and XORed with t1 ^ (t1 << 32) ^ (t1 << 64) ^ (t1 << 96),
+   where the shifts are whole-register byte shifts.  Returns the new value. */
+INLINE __m128i aes_128_assist(__m128i t1, __m128i t2)
+{
+    __m128i word    = _mm_shuffle_epi32(t2, 0xff);
+    __m128i shifted = _mm_slli_si128(t1, 0x4);
+    __m128i acc     = _mm_xor_si128(t1, shifted);
+
+    shifted = _mm_slli_si128(shifted, 0x4);
+    acc     = _mm_xor_si128(acc, shifted);
+    shifted = _mm_slli_si128(shifted, 0x4);
+    acc     = _mm_xor_si128(acc, shifted);
+
+    return _mm_xor_si128(acc, word);
+}
+
+/* Build the encryption key schedule for a 16-byte key.
+   key - 16 bytes of key material (loaded unaligned).
+   cx  - context to fill: cx->ks receives 11 round keys of 16 bytes each,
+         cx->inf.l is cleared and cx->inf.b[0] is set to 10 * 16.
+   When has_aes_ni() reports no support the call is forwarded to
+   aes_xi(encrypt_key128) and its result returned; otherwise the function
+   returns EXIT_SUCCESS. */
+AES_RETURN aes_ni(encrypt_key128)(const unsigned char *key, aes_encrypt_ctx cx[1])
+{
+    __m128i *sched = (__m128i*)cx->ks;
+    __m128i rk;
+
+    if(!has_aes_ni())
+        return aes_xi(encrypt_key128)(key, cx);
+
+    rk = _mm_loadu_si128((const __m128i*)key);
+    sched[0] = rk;
+
+    /* the round constant has to be a literal: it is an instruction immediate */
+    rk = aes_128_assist(rk, _mm_aeskeygenassist_si128(rk, 0x1));
+    sched[1] = rk;
+
+    rk = aes_128_assist(rk, _mm_aeskeygenassist_si128(rk, 0x2));
+    sched[2] = rk;
+
+    rk = aes_128_assist(rk, _mm_aeskeygenassist_si128(rk, 0x4));
+    sched[3] = rk;
+
+    rk = aes_128_assist(rk, _mm_aeskeygenassist_si128(rk, 0x8));
+    sched[4] = rk;
+
+    rk = aes_128_assist(rk, _mm_aeskeygenassist_si128(rk, 0x10));
+    sched[5] = rk;
+
+    rk = aes_128_assist(rk, _mm_aeskeygenassist_si128(rk, 0x20));
+    sched[6] = rk;
+
+    rk = aes_128_assist(rk, _mm_aeskeygenassist_si128(rk, 0x40));
+    sched[7] = rk;
+
+    rk = aes_128_assist(rk, _mm_aeskeygenassist_si128(rk, 0x80));
+    sched[8] = rk;
+
+    rk = aes_128_assist(rk, _mm_aeskeygenassist_si128(rk, 0x1b));
+    sched[9] = rk;
+
+    rk = aes_128_assist(rk, _mm_aeskeygenassist_si128(rk, 0x36));
+    sched[10] = rk;
+
+    cx->inf.l = 0;
+    cx->inf.b[0] = 10 * 16;
+    return EXIT_SUCCESS;
+}
+
+/* One step of the 192-bit key expansion, updating all three values in place.
+   On entry *t1 and *t3 hold the running key words and *t2 the
+   _mm_aeskeygenassist_si128() result for *t3.
+   On exit *t1 holds its prefix-XOR combined with dword 1 of the assist
+   value, *t3 holds (*t3 ^ (*t3 << 32)) combined with the new top dword of
+   *t1, and *t2 holds that broadcast top dword. */
+INLINE void aes_192_assist(__m128i* t1, __m128i * t2, __m128i * t3)
+{
+    __m128i lo = *t1;
+    __m128i hi = *t3;
+    __m128i word, shifted;
+
+    word    = _mm_shuffle_epi32(*t2, 0x55);
+    shifted = _mm_slli_si128(lo, 0x4);
+    lo      = _mm_xor_si128(lo, shifted);
+    shifted = _mm_slli_si128(shifted, 0x4);
+    lo      = _mm_xor_si128(lo, shifted);
+    shifted = _mm_slli_si128(shifted, 0x4);
+    lo      = _mm_xor_si128(lo, shifted);
+    lo      = _mm_xor_si128(lo, word);
+
+    word    = _mm_shuffle_epi32(lo, 0xff);
+    shifted = _mm_slli_si128(hi, 0x4);
+    hi      = _mm_xor_si128(hi, shifted);
+    hi      = _mm_xor_si128(hi, word);
+
+    *t1 = lo;
+    *t2 = word;
+    *t3 = hi;
+}
+
+/* Build the encryption key schedule for a 24-byte key.
+   key - 24 bytes of key material (loaded unaligned).
+   cx  - context to fill: cx->ks receives 13 round keys of 16 bytes each,
+         cx->inf.l is cleared and cx->inf.b[0] is set to 12 * 16.
+   When has_aes_ni() reports no support the call is forwarded to
+   aes_xi(encrypt_key192) and its result returned; otherwise the function
+   returns EXIT_SUCCESS.
+
+   Only 8 bytes are loaded from key + 16.  A full 16-byte load there would
+   read 8 bytes beyond the end of the 24-byte key.  The upper half of t3
+   never reaches the stored schedule: every ks[] slot that is assigned t3
+   whole is rewritten from its low half by the following _mm_shuffle_pd, and
+   aes_192_assist() only consumes dwords 0 and 1 of t3 for the values that
+   are kept. */
+AES_RETURN aes_ni(encrypt_key192)(const unsigned char *key, aes_encrypt_ctx cx[1])
+{
+    __m128i t1, t2, t3;
+    __m128i *ks = (__m128i*)cx->ks;
+
+    if(!has_aes_ni())
+    {
+        return aes_xi(encrypt_key192)(key, cx);
+    }
+
+    t1 = _mm_loadu_si128((const __m128i*)key);
+    t3 = _mm_loadl_epi64((const __m128i*)(key + 16));  /* key bytes 16..23, upper half zero */
+
+    ks[0] = t1;
+    ks[1] = t3;
+
+    /* each pair of assist steps yields 48 bytes, i.e. three round keys; the
+       shuffles stitch the 24-byte pieces into 16-byte round keys */
+    t2 = _mm_aeskeygenassist_si128(t3, 0x1);
+    aes_192_assist(&t1, &t2, &t3);
+
+    ks[1] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ks[1]), _mm_castsi128_pd(t1), 0));
+    ks[2] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(t1), _mm_castsi128_pd(t3), 1));
+
+    t2 = _mm_aeskeygenassist_si128(t3, 0x2);
+    aes_192_assist(&t1, &t2, &t3);
+    ks[3] = t1;
+    ks[4] = t3;
+
+    t2 = _mm_aeskeygenassist_si128(t3, 0x4);
+    aes_192_assist(&t1, &t2, &t3);
+    ks[4] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ks[4]), _mm_castsi128_pd(t1), 0));
+    ks[5] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(t1), _mm_castsi128_pd(t3), 1));
+
+    t2 = _mm_aeskeygenassist_si128(t3, 0x8);
+    aes_192_assist(&t1, &t2, &t3);
+    ks[6] = t1;
+    ks[7] = t3;
+
+    t2 = _mm_aeskeygenassist_si128(t3, 0x10);
+    aes_192_assist(&t1, &t2, &t3);
+    ks[7] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ks[7]), _mm_castsi128_pd(t1), 0));
+    ks[8] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(t1), _mm_castsi128_pd(t3), 1));
+
+    t2 = _mm_aeskeygenassist_si128(t3, 0x20);
+    aes_192_assist(&t1, &t2, &t3);
+    ks[9] = t1;
+    ks[10] = t3;
+
+    t2 = _mm_aeskeygenassist_si128(t3, 0x40);
+    aes_192_assist(&t1, &t2, &t3);
+    ks[10] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ks[10]), _mm_castsi128_pd(t1), 0));
+    ks[11] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(t1), _mm_castsi128_pd(t3), 1));
+
+    t2 = _mm_aeskeygenassist_si128(t3, 0x80);
+    aes_192_assist(&t1, &t2, &t3);
+    ks[12] = t1;
+
+    cx->inf.l = 0;
+    cx->inf.b[0] = 12 * 16;
+    return EXIT_SUCCESS;
+}
+
+/* First half of a 256-bit key-expansion step.
+   *t2 (an _mm_aeskeygenassist_si128() result) is replaced by the broadcast
+   of its top dword; *t1 is replaced by its prefix-XOR
+   (*t1 ^ *t1<<32 ^ *t1<<64 ^ *t1<<96, byte shifts) combined with that
+   broadcast value. */
+INLINE void aes_256_assist1(__m128i* t1, __m128i * t2)
+{
+    __m128i word    = _mm_shuffle_epi32(*t2, 0xff);
+    __m128i acc     = *t1;
+    __m128i shifted = _mm_slli_si128(acc, 0x4);
+
+    acc     = _mm_xor_si128(acc, shifted);
+    shifted = _mm_slli_si128(shifted, 0x4);
+    acc     = _mm_xor_si128(acc, shifted);
+    shifted = _mm_slli_si128(shifted, 0x4);
+    acc     = _mm_xor_si128(acc, shifted);
+
+    *t2 = word;
+    *t1 = _mm_xor_si128(acc, word);
+}
+
+/* Second half of a 256-bit key-expansion step.
+   Runs _mm_aeskeygenassist_si128() on *t1 with a zero round constant,
+   broadcasts dword 2 of the result, and XORs it into the prefix-XOR of *t3
+   (*t3 ^ *t3<<32 ^ *t3<<64 ^ *t3<<96, byte shifts).  Only *t3 is written;
+   *t1 is read but left unchanged. */
+INLINE void aes_256_assist2(__m128i* t1, __m128i * t3)
+{
+    __m128i word    = _mm_shuffle_epi32(_mm_aeskeygenassist_si128(*t1, 0x0), 0xaa);
+    __m128i acc     = *t3;
+    __m128i shifted = _mm_slli_si128(acc, 0x4);
+
+    acc     = _mm_xor_si128(acc, shifted);
+    shifted = _mm_slli_si128(shifted, 0x4);
+    acc     = _mm_xor_si128(acc, shifted);
+    shifted = _mm_slli_si128(shifted, 0x4);
+    acc     = _mm_xor_si128(acc, shifted);
+
+    *t3 = _mm_xor_si128(acc, word);
+}
+
+/* Build the encryption key schedule for a 32-byte key.
+   key - 32 bytes of key material (loaded unaligned as two 16-byte halves).
+   cx  - context to fill: cx->ks receives 15 round keys of 16 bytes each,
+         cx->inf.l is cleared and cx->inf.b[0] is set to 14 * 16.
+   When has_aes_ni() reports no support the call is forwarded to
+   aes_xi(encrypt_key256) and its result returned; otherwise the function
+   returns EXIT_SUCCESS. */
+AES_RETURN aes_ni(encrypt_key256)(const unsigned char *key, aes_encrypt_ctx cx[1])
+{
+    __m128i *sched = (__m128i*)cx->ks;
+    __m128i even, odd, assist;
+
+    if(!has_aes_ni())
+        return aes_xi(encrypt_key256)(key, cx);
+
+    even = _mm_loadu_si128((const __m128i*)key);
+    odd  = _mm_loadu_si128((const __m128i*)(key + 16));
+
+    sched[0] = even;
+    sched[1] = odd;
+
+    /* every step derives the next even-indexed round key from the previous
+       odd one, then the next odd-indexed key from that new even one */
+    assist = _mm_aeskeygenassist_si128(odd, 0x01);
+    aes_256_assist1(&even, &assist);
+    sched[2] = even;
+    aes_256_assist2(&even, &odd);
+    sched[3] = odd;
+
+    assist = _mm_aeskeygenassist_si128(odd, 0x02);
+    aes_256_assist1(&even, &assist);
+    sched[4] = even;
+    aes_256_assist2(&even, &odd);
+    sched[5] = odd;
+
+    assist = _mm_aeskeygenassist_si128(odd, 0x04);
+    aes_256_assist1(&even, &assist);
+    sched[6] = even;
+    aes_256_assist2(&even, &odd);
+    sched[7] = odd;
+
+    assist = _mm_aeskeygenassist_si128(odd, 0x08);
+    aes_256_assist1(&even, &assist);
+    sched[8] = even;
+    aes_256_assist2(&even, &odd);
+    sched[9] = odd;
+
+    assist = _mm_aeskeygenassist_si128(odd, 0x10);
+    aes_256_assist1(&even, &assist);
+    sched[10] = even;
+    aes_256_assist2(&even, &odd);
+    sched[11] = odd;
+
+    assist = _mm_aeskeygenassist_si128(odd, 0x20);
+    aes_256_assist1(&even, &assist);
+    sched[12] = even;
+    aes_256_assist2(&even, &odd);
+    sched[13] = odd;
+
+    /* the last step only needs the even-indexed key */
+    assist = _mm_aeskeygenassist_si128(odd, 0x40);
+    aes_256_assist1(&even, &assist);
+    sched[14] = even;
+
+    cx->inf.l = 0;
+    cx->inf.b[0] = 14 * 16;
+    return EXIT_SUCCESS;
+}
+
+/* Convert an encryption key schedule into a decryption one, in place.
+   _mm_aesimc_si128() is applied to round keys 1 .. rounds-1, where
+   rounds = cx->inf.b[0] >> 4; the first and last round keys are left
+   untouched. */
+INLINE void enc_to_dec(aes_decrypt_ctx cx[1])
+{
+    __m128i *rk = (__m128i*)cx->ks;
+    const int last = cx->inf.b[0] >> 4;
+    int r = 1;
+
+    while(r < last)
+    {
+        rk[r] = _mm_aesimc_si128(rk[r]);
+        ++r;
+    }
+}
+
+/* Build the decryption key schedule for a 16-byte key.
+   Without AES-NI support the call is forwarded to aes_xi(decrypt_key128).
+   Otherwise the encryption schedule is generated into the same context and
+   rewritten in place by enc_to_dec().
+   Returns EXIT_SUCCESS, or EXIT_FAILURE if the encryption key setup did not
+   report EXIT_SUCCESS. */
+AES_RETURN aes_ni(decrypt_key128)(const unsigned char *key, aes_decrypt_ctx cx[1])
+{
+    if(!has_aes_ni())
+        return aes_xi(decrypt_key128)(key, cx);
+
+    if(aes_ni(encrypt_key128)(key, (aes_encrypt_ctx*)cx) != EXIT_SUCCESS)
+        return EXIT_FAILURE;
+
+    enc_to_dec(cx);
+    return EXIT_SUCCESS;
+}
+
+/* Build the decryption key schedule for a 24-byte key.
+   Without AES-NI support the call is forwarded to aes_xi(decrypt_key192).
+   Otherwise the encryption schedule is generated into the same context and
+   rewritten in place by enc_to_dec().
+   Returns EXIT_SUCCESS, or EXIT_FAILURE if the encryption key setup did not
+   report EXIT_SUCCESS. */
+AES_RETURN aes_ni(decrypt_key192)(const unsigned char *key, aes_decrypt_ctx cx[1])
+{
+    if(!has_aes_ni())
+        return aes_xi(decrypt_key192)(key, cx);
+
+    if(aes_ni(encrypt_key192)(key, (aes_encrypt_ctx*)cx) != EXIT_SUCCESS)
+        return EXIT_FAILURE;
+
+    enc_to_dec(cx);
+    return EXIT_SUCCESS;
+}
+
+/* Build the decryption key schedule for a 32-byte key.
+   Without AES-NI support the call is forwarded to aes_xi(decrypt_key256).
+   Otherwise the encryption schedule is generated into the same context and
+   rewritten in place by enc_to_dec().
+   Returns EXIT_SUCCESS, or EXIT_FAILURE if the encryption key setup did not
+   report EXIT_SUCCESS. */
+AES_RETURN aes_ni(decrypt_key256)(const unsigned char *key, aes_decrypt_ctx cx[1])
+{
+    if(!has_aes_ni())
+        return aes_xi(decrypt_key256)(key, cx);
+
+    if(aes_ni(encrypt_key256)(key, (aes_encrypt_ctx*)cx) != EXIT_SUCCESS)
+        return EXIT_FAILURE;
+
+    enc_to_dec(cx);
+    return EXIT_SUCCESS;
+}
+
+/* Encrypt one 16-byte block.
+   in  - 16 input bytes (unaligned load).
+   out - 16 output bytes (unaligned store).
+   cx  - encryption context; cx->inf.b[0] must be 10*16, 12*16 or 14*16,
+         the values written by the key setup routines, otherwise
+         EXIT_FAILURE is returned before anything else happens.
+   Without AES-NI support the call is forwarded to aes_xi(encrypt).
+   Returns EXIT_SUCCESS after storing the result. */
+AES_RETURN aes_ni(encrypt)(const unsigned char *in, unsigned char *out, const aes_encrypt_ctx cx[1])
+{
+    const __m128i *rk = (const __m128i*)cx->ks;
+    const int tag = cx->inf.b[0];
+    const int rounds = tag >> 4;
+    __m128i state;
+    int r;
+
+    if(!(tag == 10 * 16 || tag == 12 * 16 || tag == 14 * 16))
+        return EXIT_FAILURE;
+
+    if(!has_aes_ni())
+        return aes_xi(encrypt)(in, out, cx);
+
+    /* initial key addition, rounds-1 full rounds, then the final round */
+    state = _mm_xor_si128(_mm_loadu_si128((const __m128i*)in), rk[0]);
+    for(r = 1; r < rounds; ++r)
+        state = _mm_aesenc_si128(state, rk[r]);
+    state = _mm_aesenclast_si128(state, rk[rounds]);
+
+    _mm_storeu_si128((__m128i*)out, state);
+    return EXIT_SUCCESS;
+}
+
+/* Decrypt one 16-byte block.
+   in  - 16 input bytes (unaligned load).
+   out - 16 output bytes (unaligned store).
+   cx  - decryption context; cx->inf.b[0] must be 10*16, 12*16 or 14*16,
+         otherwise EXIT_FAILURE is returned before anything else happens.
+   Without AES-NI support the call is forwarded to aes_xi(decrypt).
+   The round keys are consumed from index rounds down to index 0.
+   Returns EXIT_SUCCESS after storing the result. */
+AES_RETURN aes_ni(decrypt)(const unsigned char *in, unsigned char *out, const aes_decrypt_ctx cx[1])
+{
+    const __m128i *rk = (const __m128i*)cx->ks;
+    const int tag = cx->inf.b[0];
+    const int rounds = tag >> 4;
+    __m128i state;
+    int r;
+
+    if(!(tag == 10 * 16 || tag == 12 * 16 || tag == 14 * 16))
+        return EXIT_FAILURE;
+
+    if(!has_aes_ni())
+        return aes_xi(decrypt)(in, out, cx);
+
+    /* start from the last round key and walk the schedule backwards */
+    state = _mm_xor_si128(_mm_loadu_si128((const __m128i*)in), rk[rounds]);
+    for(r = rounds - 1; r > 0; --r)
+        state = _mm_aesdec_si128(state, rk[r]);
+    state = _mm_aesdeclast_si128(state, rk[0]);
+
+    _mm_storeu_si128((__m128i*)out, state);
+    return EXIT_SUCCESS;
+}
+
+#ifdef ADD_AESNI_MODE_CALLS
+#ifdef USE_AES_CONTEXT
+
+/* CBC-mode encryption using an AES context.
+   in, out - input and output buffers, accessed in 16-byte blocks.
+   ivec    - 16-byte initial chaining value; this path only reads it.
+   length  - byte count; it is rounded UP to whole blocks, so a trailing
+             partial block makes the loop read and write a full 16 bytes.
+   cx      - encryption context; cx->inf.b[0] >> 4 must be 10, 12 or 14.
+   Returns EXIT_FAILURE for any other round count.  Without AES-NI support
+   the call is forwarded to aes_cbc_encrypt() with the original byte length.
+   Otherwise returns EXIT_SUCCESS. */
+AES_RETURN aes_CBC_encrypt(const unsigned char *in,
+    unsigned char *out,
+    unsigned char ivec[16],
+    unsigned long length,
+    const aes_encrypt_ctx cx[1])
+{
+    const __m128i *rk = (const __m128i*)cx->ks;
+    const __m128i *src = (const __m128i*)in;
+    __m128i *dst = (__m128i*)out;
+    const int rounds = cx->inf.b[0] >> 4;
+    unsigned long blocks, n;
+    __m128i chain;
+    int r;
+
+    if(!(rounds == 10 || rounds == 12 || rounds == 14))
+        return EXIT_FAILURE;
+
+    if(!has_aes_ni())
+        return aes_cbc_encrypt(in, out, length, ivec, cx);
+
+    blocks = length / 16 + (length % 16 ? 1 : 0);
+    chain = _mm_loadu_si128((const __m128i*)ivec);
+
+    for(n = 0; n < blocks; n++)
+    {
+        /* chain the previous ciphertext (or the IV) into the plaintext */
+        chain = _mm_xor_si128(_mm_loadu_si128(src + n), chain);
+        chain = _mm_xor_si128(chain, rk[0]);
+        for(r = 1; r < rounds; r++)
+            chain = _mm_aesenc_si128(chain, rk[r]);
+        chain = _mm_aesenclast_si128(chain, rk[r]);
+        _mm_storeu_si128(dst + n, chain);
+    }
+    return EXIT_SUCCESS;
+}
+
+/* CBC-mode decryption using an AES context.
+   in, out - input and output buffers, accessed in 16-byte blocks.
+   ivec    - 16-byte initial chaining value; this path only reads it.
+   length  - byte count; it is rounded UP to whole blocks, so a trailing
+             partial block makes the loop read and write a full 16 bytes.
+   cx      - decryption context; cx->inf.b[0] >> 4 must be 10, 12 or 14.
+             Round keys are applied from index rounds down to index 0.
+   Returns EXIT_FAILURE for any other round count.  Without AES-NI support
+   the call is forwarded to aes_cbc_decrypt() with the original byte length.
+   Otherwise returns EXIT_SUCCESS. */
+AES_RETURN aes_CBC_decrypt(const unsigned char *in,
+    unsigned char *out,
+    unsigned char ivec[16],
+    unsigned long length,
+    const aes_decrypt_ctx cx[1])
+{
+    const __m128i *rk = (const __m128i*)cx->ks;
+    const __m128i *src = (const __m128i*)in;
+    __m128i *dst = (__m128i*)out;
+    const int rounds = cx->inf.b[0] >> 4;
+    unsigned long blocks, n;
+    __m128i chain, cipher, plain;
+    int r;
+
+    if(!(rounds == 10 || rounds == 12 || rounds == 14))
+        return EXIT_FAILURE;
+
+    if(!has_aes_ni())
+        return aes_cbc_decrypt(in, out, length, ivec, cx);
+
+    blocks = length / 16 + (length % 16 ? 1 : 0);
+    chain = _mm_loadu_si128((const __m128i*)ivec);
+
+    for(n = 0; n < blocks; n++)
+    {
+        /* keep the ciphertext block: it is the chaining value for the next one */
+        cipher = _mm_loadu_si128(src + n);
+        plain = _mm_xor_si128(cipher, rk[rounds]);
+        for(r = rounds - 1; r > 0; r--)
+            plain = _mm_aesdec_si128(plain, rk[r]);
+        plain = _mm_aesdeclast_si128(plain, rk[0]);
+        plain = _mm_xor_si128(plain, chain);
+        _mm_storeu_si128(dst + n, plain);
+        chain = cipher;
+    }
+    return EXIT_SUCCESS;
+}
+
+/* Counter-advance callback handed to aes_ctr_crypt() by the fallback path of
+   AES_CTR_encrypt().  Bytes 8..15 of the block are treated as one big-endian
+   64-bit integer and incremented by one.  This is the same arithmetic the
+   AES-NI path performs with _mm_add_epi64 on the byte-swapped upper half of
+   the counter block.  It works bytewise, so it has no alignment or aliasing
+   requirements. */
+static void ctr_inc(unsigned char *ctr_blk)
+{
+    int i;
+
+    for(i = 15; i >= 8; --i)
+    {
+        if(++ctr_blk[i] != 0)   /* stop as soon as a byte does not wrap */
+            break;
+    }
+}
+
+/* CTR-mode encryption (and decryption) using an AES context.
+   in, out - input and output buffers.
+   ivec    - 8 bytes placed at offsets 4..11 of the counter block.
+   nonce   - 4 bytes placed at offsets 0..3 of the counter block.
+   length  - byte count.  The AES-NI path rounds it UP to whole blocks, so a
+             trailing partial block makes the loop read and write a full
+             16 bytes.
+   cx      - encryption context; cx->inf.b[0] >> 4 must be 10, 12 or 14.
+   The first counter block is nonce | ivec | 00 00 00 01, and bytes 8..15
+   are incremented big-endian for each following block.
+   Returns EXIT_FAILURE for an unsupported round count.  Without AES-NI
+   support it returns the result of aes_ctr_crypt(); otherwise EXIT_SUCCESS.
+
+   Both paths start from the same bytewise-assembled counter block.  The
+   fallback used to pass a block laid out as ivec | nonce with its last four
+   bytes uninitialised.  The AES-NI path used to read the 4-byte nonce
+   through a 'long' pointer, which is 8 bytes wide on LP64 targets.
+   NOTE(review): the fallback assumes aes_ctr_crypt() uses the block as
+   supplied for the first keystream block and calls the callback once per
+   block - confirm against its definition. */
+AES_RETURN AES_CTR_encrypt(const unsigned char *in,
+    unsigned char *out,
+    const unsigned char ivec[8],
+    const unsigned char nonce[4],
+    unsigned long length,
+    const aes_encrypt_ctx cx[1])
+{
+    unsigned char ctr_blk[16];
+    const __m128i *key = (const __m128i*)cx->ks;
+    __m128i ctr_block, tmp, ONE, BSWAP_EPI64;
+    int number_of_rounds = cx->inf.b[0] >> 4, j;
+    unsigned long i;
+
+    if(number_of_rounds != 10 && number_of_rounds != 12 && number_of_rounds != 14)
+        return EXIT_FAILURE;
+
+    /* nonce | ivec | 32-bit big-endian block counter starting at 1 */
+    memcpy(ctr_blk, nonce, 4);
+    memcpy(ctr_blk + 4, ivec, 8);
+    ctr_blk[12] = 0;
+    ctr_blk[13] = 0;
+    ctr_blk[14] = 0;
+    ctr_blk[15] = 1;
+
+    if(!has_aes_ni())
+    {
+        return aes_ctr_crypt(in, out, length, (unsigned char*)ctr_blk, ctr_inc, cx);
+    }
+
+    if(length % 16)
+        length = length / 16 + 1;
+    else length /= 16;
+    ONE = _mm_set_epi32(0, 1, 0, 0);
+    BSWAP_EPI64 = _mm_setr_epi8(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
+
+    /* keep the counter byte-swapped per 64-bit half so that _mm_add_epi64
+       advances bytes 8..15 as a big-endian number */
+    ctr_block = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i*)ctr_blk), BSWAP_EPI64);
+    for(i = 0; i < length; i++)
+    {
+        tmp = _mm_shuffle_epi8(ctr_block, BSWAP_EPI64);
+        ctr_block = _mm_add_epi64(ctr_block, ONE);
+        tmp = _mm_xor_si128(tmp, key[0]);
+        for(j = 1; j < number_of_rounds; j++)
+        {
+            tmp = _mm_aesenc_si128(tmp, key[j]);
+        }
+        tmp = _mm_aesenclast_si128(tmp, key[j]);
+        tmp = _mm_xor_si128(tmp, _mm_loadu_si128(&((const __m128i*)in)[i]));
+        _mm_storeu_si128(&((__m128i*)out)[i], tmp);
+    }
+    return EXIT_SUCCESS;
+}
+
+#else
+
+/* CBC-mode encryption from a raw, already expanded key schedule.
+   in, out          - input and output buffers, accessed in 16-byte blocks.
+   ivec             - 16-byte initial chaining value (read only).
+   length           - byte count; rounded UP to whole blocks, so a trailing
+                      partial block is processed as a full 16 bytes.
+   key              - number_of_rounds + 1 round keys of 16 bytes each,
+                      dereferenced as __m128i objects.
+   number_of_rounds - rounds to run; it is not validated here. */
+void aes_CBC_encrypt(const unsigned char *in,
+    unsigned char *out,
+    unsigned char ivec[16],
+    unsigned long length,
+    unsigned char *key,
+    int number_of_rounds)
+{
+    const __m128i *rk = (const __m128i*)key;
+    const __m128i *src = (const __m128i*)in;
+    __m128i *dst = (__m128i*)out;
+    unsigned long blocks, n;
+    __m128i chain;
+    int r;
+
+    blocks = length / 16 + (length % 16 ? 1 : 0);
+    chain = _mm_loadu_si128((const __m128i*)ivec);
+
+    for(n = 0; n < blocks; n++)
+    {
+        /* chain the previous ciphertext (or the IV) into the plaintext */
+        chain = _mm_xor_si128(_mm_loadu_si128(src + n), chain);
+        chain = _mm_xor_si128(chain, rk[0]);
+        for(r = 1; r < number_of_rounds; r++)
+            chain = _mm_aesenc_si128(chain, rk[r]);
+        chain = _mm_aesenclast_si128(chain, rk[r]);
+        _mm_storeu_si128(dst + n, chain);
+    }
+}
+
+/* CBC-mode decryption from a raw, already expanded key schedule.
+   in, out          - input and output buffers, accessed in 16-byte blocks.
+   ivec             - 16-byte initial chaining value (read only).
+   length           - byte count; rounded UP to whole blocks, so a trailing
+                      partial block is processed as a full 16 bytes.
+   key              - round keys in the order they are applied here: key[0]
+                      is XORed in first, key[1] .. key[number_of_rounds - 1]
+                      feed AESDEC and key[number_of_rounds] feeds AESDECLAST.
+   number_of_rounds - rounds to run; it is not validated here. */
+void aes_CBC_decrypt(const unsigned char *in,
+    unsigned char *out,
+    unsigned char ivec[16],
+    unsigned long length,
+    unsigned char *key,
+    int number_of_rounds)
+{
+    const __m128i *rk = (const __m128i*)key;
+    const __m128i *src = (const __m128i*)in;
+    __m128i *dst = (__m128i*)out;
+    unsigned long blocks, n;
+    __m128i chain, cipher, plain;
+    int r;
+
+    blocks = length / 16 + (length % 16 ? 1 : 0);
+    chain = _mm_loadu_si128((const __m128i*)ivec);
+
+    for(n = 0; n < blocks; n++)
+    {
+        /* keep the ciphertext block: it is the chaining value for the next one */
+        cipher = _mm_loadu_si128(src + n);
+        plain = _mm_xor_si128(cipher, rk[0]);
+        for(r = 1; r < number_of_rounds; r++)
+            plain = _mm_aesdec_si128(plain, rk[r]);
+        plain = _mm_aesdeclast_si128(plain, rk[r]);
+        plain = _mm_xor_si128(plain, chain);
+        _mm_storeu_si128(dst + n, plain);
+        chain = cipher;
+    }
+}
+
+/* CTR-mode encryption (and decryption) from a raw, already expanded key
+   schedule.
+   in, out          - input and output buffers, accessed in 16-byte blocks.
+   ivec             - 8 bytes placed at offsets 4..11 of the counter block.
+   nonce            - 4 bytes placed at offsets 0..3 of the counter block.
+   length           - byte count; rounded UP to whole blocks, so a trailing
+                      partial block is processed as a full 16 bytes.
+   key              - number_of_rounds + 1 round keys of 16 bytes each,
+                      dereferenced as __m128i objects.
+   number_of_rounds - rounds to run; it is not validated here.
+   The first counter block is nonce | ivec | 00 00 00 01, and bytes 8..15
+   are incremented big-endian for each following block.
+
+   The counter block is assembled bytewise.  The previous code read the
+   4-byte nonce through a 'long' pointer, which is 8 bytes wide on LP64
+   targets, and needed a compiler-specific branch to insert ivec. */
+void AES_CTR_encrypt(const unsigned char *in,
+    unsigned char *out,
+    const unsigned char ivec[8],
+    const unsigned char nonce[4],
+    unsigned long length,
+    const unsigned char *key,
+    int number_of_rounds)
+{
+    unsigned char ctr_blk[16];
+    const __m128i *rk = (const __m128i*)key;
+    __m128i ctr_block, tmp, ONE, BSWAP_EPI64;
+    unsigned long i;
+    int j;
+
+    if(length % 16)
+        length = length / 16 + 1;
+    else length /= 16;
+    ONE = _mm_set_epi32(0, 1, 0, 0);
+    BSWAP_EPI64 = _mm_setr_epi8(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
+
+    /* nonce | ivec | 32-bit big-endian block counter starting at 1 */
+    memcpy(ctr_blk, nonce, 4);
+    memcpy(ctr_blk + 4, ivec, 8);
+    ctr_blk[12] = 0;
+    ctr_blk[13] = 0;
+    ctr_blk[14] = 0;
+    ctr_blk[15] = 1;
+
+    /* keep the counter byte-swapped per 64-bit half so that _mm_add_epi64
+       advances bytes 8..15 as a big-endian number */
+    ctr_block = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i*)ctr_blk), BSWAP_EPI64);
+    for(i = 0; i < length; i++)
+    {
+        tmp = _mm_shuffle_epi8(ctr_block, BSWAP_EPI64);
+        ctr_block = _mm_add_epi64(ctr_block, ONE);
+        tmp = _mm_xor_si128(tmp, rk[0]);
+        for(j = 1; j < number_of_rounds; j++)
+        {
+            tmp = _mm_aesenc_si128(tmp, rk[j]);
+        }
+        tmp = _mm_aesenclast_si128(tmp, rk[j]);
+        tmp = _mm_xor_si128(tmp, _mm_loadu_si128(&((const __m128i*)in)[i]));
+        _mm_storeu_si128(&((__m128i*)out)[i], tmp);
+    }
+}
+#endif
+#endif
+
+#endif