[Python-checkins] cpython: Issue #16113: integrate SHA-3 (Keccak) patch from

christian.heimes python-checkins at python.org
Sat Oct 6 02:36:21 CEST 2012


http://hg.python.org/cpython/rev/11c9a894680e
changeset:   79502:11c9a894680e
user:        Christian Heimes <christian at cheimes.de>
date:        Sat Oct 06 02:23:36 2012 +0200
summary:
  Issue #16113: integrate SHA-3 (Keccak) patch from http://hg.python.org/sandbox/cheimes

files:
  Doc/library/hashlib.rst                            |    10 +-
  Doc/license.rst                                    |    19 +
  Doc/whatsnew/3.4.rst                               |     2 +-
  Lib/hashlib.py                                     |    15 +-
  Lib/test/test_hashlib.py                           |   127 +-
  Modules/_hashopenssl.c                             |    22 -
  Modules/_sha3/cleanup.py                           |    49 +
  Modules/_sha3/keccak/KeccakF-1600-32-rvk.macros    |   555 ++++
  Modules/_sha3/keccak/KeccakF-1600-32-s1.macros     |  1187 ++++++++++
  Modules/_sha3/keccak/KeccakF-1600-32-s2.macros     |  1187 ++++++++++
  Modules/_sha3/keccak/KeccakF-1600-32.macros        |    26 +
  Modules/_sha3/keccak/KeccakF-1600-64.macros        |   728 ++++++
  Modules/_sha3/keccak/KeccakF-1600-int-set.h        |     6 +
  Modules/_sha3/keccak/KeccakF-1600-interface.h      |    46 +
  Modules/_sha3/keccak/KeccakF-1600-opt32-settings.h |     6 +
  Modules/_sha3/keccak/KeccakF-1600-opt32.c          |   524 ++++
  Modules/_sha3/keccak/KeccakF-1600-opt64-settings.h |     9 +
  Modules/_sha3/keccak/KeccakF-1600-opt64.c          |   508 ++++
  Modules/_sha3/keccak/KeccakF-1600-simd128.macros   |   651 +++++
  Modules/_sha3/keccak/KeccakF-1600-simd64.macros    |   517 ++++
  Modules/_sha3/keccak/KeccakF-1600-unrolling.macros |   124 +
  Modules/_sha3/keccak/KeccakF-1600-xop.macros       |   573 ++++
  Modules/_sha3/keccak/KeccakNISTInterface.c         |    83 +
  Modules/_sha3/keccak/KeccakNISTInterface.h         |    72 +
  Modules/_sha3/keccak/KeccakSponge.c                |   266 ++
  Modules/_sha3/keccak/KeccakSponge.h                |    76 +
  Modules/_sha3/keccak/crypto_hash.h                 |     0 
  Modules/_sha3/sha3module.c                         |   569 ++++
  Modules/hashlib.h                                  |    33 +
  setup.py                                           |     9 +
  30 files changed, 7971 insertions(+), 28 deletions(-)


diff --git a/Doc/library/hashlib.rst b/Doc/library/hashlib.rst
--- a/Doc/library/hashlib.rst
+++ b/Doc/library/hashlib.rst
@@ -51,9 +51,13 @@
 .. index:: single: OpenSSL; (use in module hashlib)
 
 Constructors for hash algorithms that are always present in this module are
-:func:`md5`, :func:`sha1`, :func:`sha224`, :func:`sha256`, :func:`sha384`, and
-:func:`sha512`.  Additional algorithms may also be available depending upon the
-OpenSSL library that Python uses on your platform.
+:func:`md5`, :func:`sha1`, :func:`sha224`, :func:`sha256`, :func:`sha384`,
+:func:`sha512`, :func:`sha3_224`, :func:`sha3_256`, :func:`sha3_384`, and
+:func:`sha3_512`. Additional algorithms may also be available depending upon
+the OpenSSL library that Python uses on your platform.
+
+   .. versionchanged:: 3.4
+      Add sha3 family of hash algorithms.
 
 For example, to obtain the digest of the byte string ``b'Nobody inspects the
 spammish repetition'``::
diff --git a/Doc/license.rst b/Doc/license.rst
--- a/Doc/license.rst
+++ b/Doc/license.rst
@@ -658,6 +658,25 @@
   SUCH DAMAGE.
 
 
+SHA-3
+-----
+
+The modules :mod:`_sha3` and :mod:`hashlib` use the reference
+implementation of Keccak. The files at :file:`Modules/_sha3/keccak/` contain
+the following note::
+
+  The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+  Michaël Peeters and Gilles Van Assche. For more information, feedback or
+  questions, please refer to our website: http://keccak.noekeon.org/
+
+  Implementation by the designers,
+  hereby denoted as "the implementer".
+
+  To the extent possible under law, the implementer has waived all copyright
+  and related or neighboring rights to the source code in this file.
+  http://creativecommons.org/publicdomain/zero/1.0/
+
+
 strtod and dtoa
 ---------------
 
diff --git a/Doc/whatsnew/3.4.rst b/Doc/whatsnew/3.4.rst
--- a/Doc/whatsnew/3.4.rst
+++ b/Doc/whatsnew/3.4.rst
@@ -101,7 +101,7 @@
 
 Significantly Improved Library Modules:
 
-* None yet.
+* SHA-3 (Keccak) support for :mod:`hashlib`.
 
 Security improvements:
 
diff --git a/Lib/hashlib.py b/Lib/hashlib.py
--- a/Lib/hashlib.py
+++ b/Lib/hashlib.py
@@ -54,7 +54,8 @@
 
 # This tuple and __get_builtin_constructor() must be modified if a new
 # always available algorithm is added.
-__always_supported = ('md5', 'sha1', 'sha224', 'sha256', 'sha384', 'sha512')
+__always_supported = ('md5', 'sha1', 'sha224', 'sha256', 'sha384', 'sha512',
+                      'sha3_224', 'sha3_256', 'sha3_384', 'sha3_512')
 
 algorithms_guaranteed = set(__always_supported)
 algorithms_available = set(__always_supported)
@@ -85,6 +86,18 @@
                 return _sha512.sha512
             elif bs == '384':
                 return _sha512.sha384
+        elif name in {'sha3_224', 'sha3_256', 'sha3_384', 'sha3_512',
+                      'SHA3_224', 'SHA3_256', 'SHA3_384', 'SHA3_512'}:
+            import _sha3
+            bs = name[5:]
+            if bs == '224':
+                return _sha3.sha3_224
+            elif bs == '256':
+                return _sha3.sha3_256
+            elif bs == '384':
+                return _sha3.sha3_384
+            elif bs == '512':
+                return _sha3.sha3_512
     except ImportError:
         pass  # no extension module, this hash is unsupported.
 
diff --git a/Lib/test/test_hashlib.py b/Lib/test/test_hashlib.py
--- a/Lib/test/test_hashlib.py
+++ b/Lib/test/test_hashlib.py
@@ -36,7 +36,10 @@
 class HashLibTestCase(unittest.TestCase):
     supported_hash_names = ( 'md5', 'MD5', 'sha1', 'SHA1',
                              'sha224', 'SHA224', 'sha256', 'SHA256',
-                             'sha384', 'SHA384', 'sha512', 'SHA512' )
+                             'sha384', 'SHA384', 'sha512', 'SHA512',
+                             'sha3_224', 'sha3_256', 'sha3_384',
+                             'sha3_512', 'SHA3_224', 'SHA3_256',
+                             'SHA3_384', 'SHA3_512' )
 
     # Issue #14693: fallback modules are always compiled under POSIX
     _warn_on_extension_import = os.name == 'posix' or COMPILED_WITH_PYDEBUG
@@ -93,6 +96,12 @@
         if _sha512:
             self.constructors_to_test['sha384'].add(_sha512.sha384)
             self.constructors_to_test['sha512'].add(_sha512.sha512)
+        _sha3 = self._conditional_import_module('_sha3')
+        if _sha3:
+            self.constructors_to_test['sha3_224'].add(_sha3.sha3_224)
+            self.constructors_to_test['sha3_256'].add(_sha3.sha3_256)
+            self.constructors_to_test['sha3_384'].add(_sha3.sha3_384)
+            self.constructors_to_test['sha3_512'].add(_sha3.sha3_512)
 
         super(HashLibTestCase, self).__init__(*args, **kwargs)
 
@@ -158,6 +167,7 @@
             self.assertEqual(m1.digest(), m2.digest())
 
     def check(self, name, data, digest):
+        digest = digest.lower()
         constructors = self.constructors_to_test[name]
         # 2 is for hashlib.name(...) and hashlib.new(name, ...)
         self.assertGreaterEqual(len(constructors), 2)
@@ -183,6 +193,10 @@
         self.check_no_unicode('sha256')
         self.check_no_unicode('sha384')
         self.check_no_unicode('sha512')
+        self.check_no_unicode('sha3_224')
+        self.check_no_unicode('sha3_256')
+        self.check_no_unicode('sha3_384')
+        self.check_no_unicode('sha3_512')
 
     def test_case_md5_0(self):
         self.check('md5', b'', 'd41d8cd98f00b204e9800998ecf8427e')
@@ -318,11 +332,122 @@
           "e718483d0ce769644e2e42c7bc15b4638e1f98b13b2044285632a803afa973eb"+
           "de0ff244877ea60a4cb0432ce577c31beb009c5c2c49aa2e4eadb217ad8cc09b")
 
+    # SHA-3 family
+    def test_case_sha3_224_0(self):
+        self.check('sha3_224', b"",
+          "F71837502BA8E10837BDD8D365ADB85591895602FC552B48B7390ABD")
+
+    def test_case_sha3_224_1(self):
+        self.check('sha3_224', bytes.fromhex("CC"),
+          "A9CAB59EB40A10B246290F2D6086E32E3689FAF1D26B470C899F2802")
+
+    def test_case_sha3_224_2(self):
+        self.check('sha3_224', bytes.fromhex("41FB"),
+          "615BA367AFDC35AAC397BC7EB5D58D106A734B24986D5D978FEFD62C")
+
+    def test_case_sha3_224_3(self):
+        self.check('sha3_224', bytes.fromhex(
+            "433C5303131624C0021D868A30825475E8D0BD3052A022180398F4CA4423B9"+
+            "8214B6BEAAC21C8807A2C33F8C93BD42B092CC1B06CEDF3224D5ED1EC29784"+
+            "444F22E08A55AA58542B524B02CD3D5D5F6907AFE71C5D7462224A3F9D9E53"+
+            "E7E0846DCBB4CE"),
+          "62B10F1B6236EBC2DA72957742A8D4E48E213B5F8934604BFD4D2C3A")
+
+    @bigmemtest(size=_4G + 5, memuse=1)
+    def test_case_sha3_224_huge(self, size):
+        if size == _4G + 5:
+            try:
+                self.check('sha3_224', b'A'*size,
+                           '58ef60057c9dddb6a87477e9ace5a26f0d9db01881cf9b10a9f8c224')
+            except OverflowError:
+                pass # 32-bit arch
+
+
+    def test_case_sha3_256_0(self):
+        self.check('sha3_256', b"",
+          "C5D2460186F7233C927E7DB2DCC703C0E500B653CA82273B7BFAD8045D85A470")
+
+    def test_case_sha3_256_1(self):
+        self.check('sha3_256', bytes.fromhex("CC"),
+          "EEAD6DBFC7340A56CAEDC044696A168870549A6A7F6F56961E84A54BD9970B8A")
+
+    def test_case_sha3_256_2(self):
+        self.check('sha3_256', bytes.fromhex("41FB"),
+          "A8EACEDA4D47B3281A795AD9E1EA2122B407BAF9AABCB9E18B5717B7873537D2")
+
+    def test_case_sha3_256_3(self):
+        self.check('sha3_256', bytes.fromhex(
+            "433C5303131624C0021D868A30825475E8D0BD3052A022180398F4CA4423B9"+
+            "8214B6BEAAC21C8807A2C33F8C93BD42B092CC1B06CEDF3224D5ED1EC29784"+
+            "444F22E08A55AA58542B524B02CD3D5D5F6907AFE71C5D7462224A3F9D9E53"+
+            "E7E0846DCBB4CE"),
+          "CE87A5173BFFD92399221658F801D45C294D9006EE9F3F9D419C8D427748DC41")
+
+
+    def test_case_sha3_384_0(self):
+        self.check('sha3_384', b"",
+          "2C23146A63A29ACF99E73B88F8C24EAA7DC60AA771780CCC006AFBFA8FE2479B"+
+          "2DD2B21362337441AC12B515911957FF")
+
+    def test_case_sha3_384_1(self):
+        self.check('sha3_384', bytes.fromhex("CC"),
+          "1B84E62A46E5A201861754AF5DC95C4A1A69CAF4A796AE405680161E29572641"+
+          "F5FA1E8641D7958336EE7B11C58F73E9")
+
+    def test_case_sha3_384_2(self):
+        self.check('sha3_384', bytes.fromhex("41FB"),
+          "495CCE2714CD72C8C53C3363D22C58B55960FE26BE0BF3BBC7A3316DD563AD1D"+
+          "B8410E75EEFEA655E39D4670EC0B1792")
+
+    def test_case_sha3_384_3(self):
+        self.check('sha3_384', bytes.fromhex(
+            "433C5303131624C0021D868A30825475E8D0BD3052A022180398F4CA4423B9"+
+            "8214B6BEAAC21C8807A2C33F8C93BD42B092CC1B06CEDF3224D5ED1EC29784"+
+            "444F22E08A55AA58542B524B02CD3D5D5F6907AFE71C5D7462224A3F9D9E53"+
+            "E7E0846DCBB4CE"),
+          "135114508DD63E279E709C26F7817C0482766CDE49132E3EDF2EEDD8996F4E35"+
+          "96D184100B384868249F1D8B8FDAA2C9")
+
+
+    def test_case_sha3_512_0(self):
+        self.check('sha3_512', b"",
+          "0EAB42DE4C3CEB9235FC91ACFFE746B29C29A8C366B7C60E4E67C466F36A4304"+
+          "C00FA9CAF9D87976BA469BCBE06713B435F091EF2769FB160CDAB33D3670680E")
+
+    def test_case_sha3_512_1(self):
+        self.check('sha3_512', bytes.fromhex("CC"),
+          "8630C13CBD066EA74BBE7FE468FEC1DEE10EDC1254FB4C1B7C5FD69B646E4416"+
+          "0B8CE01D05A0908CA790DFB080F4B513BC3B6225ECE7A810371441A5AC666EB9")
+
+    def test_case_sha3_512_2(self):
+        self.check('sha3_512', bytes.fromhex("41FB"),
+          "551DA6236F8B96FCE9F97F1190E901324F0B45E06DBBB5CDB8355D6ED1DC34B3"+
+          "F0EAE7DCB68622FF232FA3CECE0D4616CDEB3931F93803662A28DF1CD535B731")
+
+    def test_case_sha3_512_3(self):
+        self.check('sha3_512', bytes.fromhex(
+            "433C5303131624C0021D868A30825475E8D0BD3052A022180398F4CA4423B9"+
+            "8214B6BEAAC21C8807A2C33F8C93BD42B092CC1B06CEDF3224D5ED1EC29784"+
+            "444F22E08A55AA58542B524B02CD3D5D5F6907AFE71C5D7462224A3F9D9E53"+
+            "E7E0846DCBB4CE"),
+          "527D28E341E6B14F4684ADB4B824C496C6482E51149565D3D17226828884306B"+
+          "51D6148A72622C2B75F5D3510B799D8BDC03EAEDE453676A6EC8FE03A1AD0EAB")
+
+
     def test_gil(self):
         # Check things work fine with an input larger than the size required
         # for multithreaded operation (which is hardwired to 2048).
         gil_minsize = 2048
 
+        for name in self.supported_hash_names:
+            m = hashlib.new(name)
+            m.update(b'1')
+            m.update(b'#' * gil_minsize)
+            m.update(b'1')
+
+            m = hashlib.new(name, b'x' * gil_minsize)
+            m.update(b'1')
+
         m = hashlib.md5()
         m.update(b'1')
         m.update(b'#' * gil_minsize)
diff --git a/Modules/_hashopenssl.c b/Modules/_hashopenssl.c
--- a/Modules/_hashopenssl.c
+++ b/Modules/_hashopenssl.c
@@ -17,24 +17,6 @@
 #include "structmember.h"
 #include "hashlib.h"
 
-#ifdef WITH_THREAD
-#include "pythread.h"
-    #define ENTER_HASHLIB(obj) \
-        if ((obj)->lock) { \
-            if (!PyThread_acquire_lock((obj)->lock, 0)) { \
-                Py_BEGIN_ALLOW_THREADS \
-                PyThread_acquire_lock((obj)->lock, 1); \
-                Py_END_ALLOW_THREADS \
-            } \
-        }
-    #define LEAVE_HASHLIB(obj) \
-        if ((obj)->lock) { \
-            PyThread_release_lock((obj)->lock); \
-        }
-#else
-    #define ENTER_HASHLIB(obj)
-    #define LEAVE_HASHLIB(obj)
-#endif
 
 /* EVP is the preferred interface to hashing in OpenSSL */
 #include <openssl/evp.h>
@@ -43,10 +25,6 @@
 
 #define MUNCH_SIZE INT_MAX
 
-/* TODO(gps): We should probably make this a module or EVPobject attribute
- * to allow the user to optimize based on the platform they're using. */
-#define HASHLIB_GIL_MINSIZE 2048
-
 #ifndef HASH_OBJ_CONSTRUCTOR
 #define HASH_OBJ_CONSTRUCTOR 0
 #endif
diff --git a/Modules/_sha3/cleanup.py b/Modules/_sha3/cleanup.py
new file mode 100755
--- /dev/null
+++ b/Modules/_sha3/cleanup.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python
+# Copyright (C) 2012   Christian Heimes (christian at python.org)
+# Licensed to PSF under a Contributor Agreement.
+#
+# cleanup Keccak sources
+
+import os
+import re
+
+CPP1 = re.compile("^//(.*)")
+CPP2 = re.compile("\ //(.*)")
+
+STATICS = ("void ", "int ", "HashReturn ", "const UINT64 ", "UINT16 ")
+
+HERE = os.path.dirname(os.path.abspath(__file__))
+KECCAK = os.path.join(HERE, "keccak")
+
+def getfiles():
+    for name in os.listdir(KECCAK):
+        name = os.path.join(KECCAK, name)
+        if os.path.isfile(name):
+            yield name
+
+def cleanup(f):
+    buf = []
+    for line in f:
+        # mark all functions and global data as static
+        if line.startswith(STATICS):
+            buf.append("static " + line)
+            continue
+        # remove UINT64 typedef, we have our own
+        if line.startswith("typedef unsigned long long int"):
+            buf.append("/* %s */\n" % line.strip())
+            continue
+        # remove #include "brg_endian.h"
+        if "brg_endian.h" in line:
+            buf.append("/* %s */\n" % line.strip())
+            continue
+        # transform C++ comments into ANSI C comments
+        line = CPP1.sub(r"/* \1 */", line)
+        line = CPP2.sub(r" /* \1 */", line)
+        buf.append(line)
+    return "".join(buf)
+
+for name in getfiles():
+    with open(name) as f:
+        res = cleanup(f)
+    with open(name, "w") as f:
+        f.write(res)
diff --git a/Modules/_sha3/keccak/KeccakF-1600-32-rvk.macros b/Modules/_sha3/keccak/KeccakF-1600-32-rvk.macros
new file mode 100644
--- /dev/null
+++ b/Modules/_sha3/keccak/KeccakF-1600-32-rvk.macros
@@ -0,0 +1,555 @@
+/*
+The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+Michaël Peeters and Gilles Van Assche. For more information, feedback or
+questions, please refer to our website: http://keccak.noekeon.org/
+
+Implementation by Ronny Van Keer,
+hereby denoted as "the implementer".
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+static const UINT32 KeccakF1600RoundConstants_int2[2*24] =
+{
+    0x00000001UL,    0x00000000UL,
+    0x00000000UL,    0x00000089UL,
+    0x00000000UL,    0x8000008bUL,
+    0x00000000UL,    0x80008080UL,
+    0x00000001UL,    0x0000008bUL,
+    0x00000001UL,    0x00008000UL,
+    0x00000001UL,    0x80008088UL,
+    0x00000001UL,    0x80000082UL,
+    0x00000000UL,    0x0000000bUL,
+    0x00000000UL,    0x0000000aUL,
+    0x00000001UL,    0x00008082UL,
+    0x00000000UL,    0x00008003UL,
+    0x00000001UL,    0x0000808bUL,
+    0x00000001UL,    0x8000000bUL,
+    0x00000001UL,    0x8000008aUL,
+    0x00000001UL,    0x80000081UL,
+    0x00000000UL,    0x80000081UL,
+    0x00000000UL,    0x80000008UL,
+    0x00000000UL,    0x00000083UL,
+    0x00000000UL,    0x80008003UL,
+    0x00000001UL,    0x80008088UL,
+    0x00000000UL,    0x80000088UL,
+    0x00000001UL,    0x00008000UL,
+    0x00000000UL,    0x80008082UL
+};
+
+#undef rounds
+
+#define rounds \
+{ \
+    UINT32 Da0, De0, Di0, Do0, Du0; \
+    UINT32 Da1, De1, Di1, Do1, Du1; \
+    UINT32 Ba, Be, Bi, Bo, Bu; \
+    UINT32 Aba0, Abe0, Abi0, Abo0, Abu0; \
+    UINT32 Aba1, Abe1, Abi1, Abo1, Abu1; \
+    UINT32 Aga0, Age0, Agi0, Ago0, Agu0; \
+    UINT32 Aga1, Age1, Agi1, Ago1, Agu1; \
+    UINT32 Aka0, Ake0, Aki0, Ako0, Aku0; \
+    UINT32 Aka1, Ake1, Aki1, Ako1, Aku1; \
+    UINT32 Ama0, Ame0, Ami0, Amo0, Amu0; \
+    UINT32 Ama1, Ame1, Ami1, Amo1, Amu1; \
+    UINT32 Asa0, Ase0, Asi0, Aso0, Asu0; \
+    UINT32 Asa1, Ase1, Asi1, Aso1, Asu1; \
+    UINT32 Cw, Cx, Cy, Cz; \
+    UINT32 Eba0, Ebe0, Ebi0, Ebo0, Ebu0; \
+    UINT32 Eba1, Ebe1, Ebi1, Ebo1, Ebu1; \
+    UINT32 Ega0, Ege0, Egi0, Ego0, Egu0; \
+    UINT32 Ega1, Ege1, Egi1, Ego1, Egu1; \
+    UINT32 Eka0, Eke0, Eki0, Eko0, Eku0; \
+    UINT32 Eka1, Eke1, Eki1, Eko1, Eku1; \
+    UINT32 Ema0, Eme0, Emi0, Emo0, Emu0; \
+    UINT32 Ema1, Eme1, Emi1, Emo1, Emu1; \
+    UINT32 Esa0, Ese0, Esi0, Eso0, Esu0; \
+    UINT32 Esa1, Ese1, Esi1, Eso1, Esu1; \
+	const UINT32 * pRoundConstants = KeccakF1600RoundConstants_int2; \
+    UINT32 i; \
+\
+    copyFromState(A, state) \
+\
+    for( i = 12; i != 0; --i ) { \
+	    Cx = Abu0^Agu0^Aku0^Amu0^Asu0; \
+	    Du1 = Abe1^Age1^Ake1^Ame1^Ase1; \
+	    Da0 = Cx^ROL32(Du1, 1); \
+	    Cz = Abu1^Agu1^Aku1^Amu1^Asu1; \
+	    Du0 = Abe0^Age0^Ake0^Ame0^Ase0; \
+	    Da1 = Cz^Du0; \
+\
+	    Cw = Abi0^Agi0^Aki0^Ami0^Asi0; \
+	    Do0 = Cw^ROL32(Cz, 1); \
+	    Cy = Abi1^Agi1^Aki1^Ami1^Asi1; \
+	    Do1 = Cy^Cx; \
+\
+	    Cx = Aba0^Aga0^Aka0^Ama0^Asa0; \
+	    De0 = Cx^ROL32(Cy, 1); \
+	    Cz = Aba1^Aga1^Aka1^Ama1^Asa1; \
+	    De1 = Cz^Cw; \
+\
+	    Cy = Abo1^Ago1^Ako1^Amo1^Aso1; \
+	    Di0 = Du0^ROL32(Cy, 1); \
+	    Cw = Abo0^Ago0^Ako0^Amo0^Aso0; \
+	    Di1 = Du1^Cw; \
+\
+	    Du0 = Cw^ROL32(Cz, 1); \
+	    Du1 = Cy^Cx; \
+\
+	    Aba0 ^= Da0; \
+	    Ba = Aba0; \
+	    Age0 ^= De0; \
+	    Be = ROL32(Age0, 22); \
+	    Aki1 ^= Di1; \
+	    Bi = ROL32(Aki1, 22); \
+	    Amo1 ^= Do1; \
+	    Bo = ROL32(Amo1, 11); \
+	    Asu0 ^= Du0; \
+	    Bu = ROL32(Asu0, 7); \
+	    Eba0 =   Ba ^((~Be)&  Bi ) ^ *(pRoundConstants++); \
+	    Ebe0 =   Be ^((~Bi)&  Bo ); \
+	    Ebi0 =   Bi ^((~Bo)&  Bu ); \
+	    Ebo0 =   Bo ^((~Bu)&  Ba ); \
+	    Ebu0 =   Bu ^((~Ba)&  Be ); \
+\
+	    Abo0 ^= Do0; \
+	    Ba = ROL32(Abo0, 14); \
+	    Agu0 ^= Du0; \
+	    Be = ROL32(Agu0, 10); \
+	    Aka1 ^= Da1; \
+	    Bi = ROL32(Aka1, 2); \
+	    Ame1 ^= De1; \
+	    Bo = ROL32(Ame1, 23); \
+	    Asi1 ^= Di1; \
+	    Bu = ROL32(Asi1, 31); \
+	    Ega0 =   Ba ^((~Be)&  Bi ); \
+	    Ege0 =   Be ^((~Bi)&  Bo ); \
+	    Egi0 =   Bi ^((~Bo)&  Bu ); \
+	    Ego0 =   Bo ^((~Bu)&  Ba ); \
+	    Egu0 =   Bu ^((~Ba)&  Be ); \
+\
+	    Abe1 ^= De1; \
+	    Ba = ROL32(Abe1, 1); \
+	    Agi0 ^= Di0; \
+	    Be = ROL32(Agi0, 3); \
+	    Ako1 ^= Do1; \
+	    Bi = ROL32(Ako1, 13); \
+	    Amu0 ^= Du0; \
+	    Bo = ROL32(Amu0, 4); \
+	    Asa0 ^= Da0; \
+	    Bu = ROL32(Asa0, 9); \
+	    Eka0 =   Ba ^((~Be)&  Bi ); \
+	    Eke0 =   Be ^((~Bi)&  Bo ); \
+	    Eki0 =   Bi ^((~Bo)&  Bu ); \
+	    Eko0 =   Bo ^((~Bu)&  Ba ); \
+	    Eku0 =   Bu ^((~Ba)&  Be ); \
+\
+	    Abu1 ^= Du1; \
+	    Ba = ROL32(Abu1, 14); \
+	    Aga0 ^= Da0; \
+	    Be = ROL32(Aga0, 18); \
+	    Ake0 ^= De0; \
+	    Bi = ROL32(Ake0, 5); \
+	    Ami1 ^= Di1; \
+	    Bo = ROL32(Ami1, 8); \
+	    Aso0 ^= Do0; \
+	    Bu = ROL32(Aso0, 28); \
+	    Ema0 =   Ba ^((~Be)&  Bi ); \
+	    Eme0 =   Be ^((~Bi)&  Bo ); \
+	    Emi0 =   Bi ^((~Bo)&  Bu ); \
+	    Emo0 =   Bo ^((~Bu)&  Ba ); \
+	    Emu0 =   Bu ^((~Ba)&  Be ); \
+\
+	    Abi0 ^= Di0; \
+	    Ba = ROL32(Abi0, 31); \
+	    Ago1 ^= Do1; \
+	    Be = ROL32(Ago1, 28); \
+	    Aku1 ^= Du1; \
+	    Bi = ROL32(Aku1, 20); \
+	    Ama1 ^= Da1; \
+	    Bo = ROL32(Ama1, 21); \
+	    Ase0 ^= De0; \
+	    Bu = ROL32(Ase0, 1); \
+	    Esa0 =   Ba ^((~Be)&  Bi ); \
+	    Ese0 =   Be ^((~Bi)&  Bo ); \
+	    Esi0 =   Bi ^((~Bo)&  Bu ); \
+	    Eso0 =   Bo ^((~Bu)&  Ba ); \
+	    Esu0 =   Bu ^((~Ba)&  Be ); \
+\
+	    Aba1 ^= Da1; \
+	    Ba = Aba1; \
+	    Age1 ^= De1; \
+	    Be = ROL32(Age1, 22); \
+	    Aki0 ^= Di0; \
+	    Bi = ROL32(Aki0, 21); \
+	    Amo0 ^= Do0; \
+	    Bo = ROL32(Amo0, 10); \
+	    Asu1 ^= Du1; \
+	    Bu = ROL32(Asu1, 7); \
+	    Eba1 =   Ba ^((~Be)&  Bi ); \
+	    Eba1 ^= *(pRoundConstants++); \
+	    Ebe1 =   Be ^((~Bi)&  Bo ); \
+	    Ebi1 =   Bi ^((~Bo)&  Bu ); \
+	    Ebo1 =   Bo ^((~Bu)&  Ba ); \
+	    Ebu1 =   Bu ^((~Ba)&  Be ); \
+\
+	    Abo1 ^= Do1; \
+	    Ba = ROL32(Abo1, 14); \
+	    Agu1 ^= Du1; \
+	    Be = ROL32(Agu1, 10); \
+	    Aka0 ^= Da0; \
+	    Bi = ROL32(Aka0, 1); \
+	    Ame0 ^= De0; \
+	    Bo = ROL32(Ame0, 22); \
+	    Asi0 ^= Di0; \
+	    Bu = ROL32(Asi0, 30); \
+	    Ega1 =   Ba ^((~Be)&  Bi ); \
+	    Ege1 =   Be ^((~Bi)&  Bo ); \
+	    Egi1 =   Bi ^((~Bo)&  Bu ); \
+	    Ego1 =   Bo ^((~Bu)&  Ba ); \
+	    Egu1 =   Bu ^((~Ba)&  Be ); \
+\
+	    Abe0 ^= De0; \
+	    Ba = Abe0; \
+	    Agi1 ^= Di1; \
+	    Be = ROL32(Agi1, 3); \
+	    Ako0 ^= Do0; \
+	    Bi = ROL32(Ako0, 12); \
+	    Amu1 ^= Du1; \
+	    Bo = ROL32(Amu1, 4); \
+	    Asa1 ^= Da1; \
+	    Bu = ROL32(Asa1, 9); \
+	    Eka1 =   Ba ^((~Be)&  Bi ); \
+	    Eke1 =   Be ^((~Bi)&  Bo ); \
+	    Eki1 =   Bi ^((~Bo)&  Bu ); \
+	    Eko1 =   Bo ^((~Bu)&  Ba ); \
+	    Eku1 =   Bu ^((~Ba)&  Be ); \
+\
+	    Abu0 ^= Du0; \
+	    Ba = ROL32(Abu0, 13); \
+	    Aga1 ^= Da1; \
+	    Be = ROL32(Aga1, 18); \
+	    Ake1 ^= De1; \
+	    Bi = ROL32(Ake1, 5); \
+	    Ami0 ^= Di0; \
+	    Bo = ROL32(Ami0, 7); \
+	    Aso1 ^= Do1; \
+	    Bu = ROL32(Aso1, 28); \
+	    Ema1 =   Ba ^((~Be)&  Bi ); \
+	    Eme1 =   Be ^((~Bi)&  Bo ); \
+	    Emi1 =   Bi ^((~Bo)&  Bu ); \
+	    Emo1 =   Bo ^((~Bu)&  Ba ); \
+	    Emu1 =   Bu ^((~Ba)&  Be ); \
+\
+	    Abi1 ^= Di1; \
+	    Ba = ROL32(Abi1, 31); \
+	    Ago0 ^= Do0; \
+	    Be = ROL32(Ago0, 27); \
+	    Aku0 ^= Du0; \
+	    Bi = ROL32(Aku0, 19); \
+	    Ama0 ^= Da0; \
+	    Bo = ROL32(Ama0, 20); \
+	    Ase1 ^= De1; \
+	    Bu = ROL32(Ase1, 1); \
+	    Esa1 =   Ba ^((~Be)&  Bi ); \
+	    Ese1 =   Be ^((~Bi)&  Bo ); \
+	    Esi1 =   Bi ^((~Bo)&  Bu ); \
+	    Eso1 =   Bo ^((~Bu)&  Ba ); \
+	    Esu1 =   Bu ^((~Ba)&  Be ); \
+\
+	    Cx = Ebu0^Egu0^Eku0^Emu0^Esu0; \
+	    Du1 = Ebe1^Ege1^Eke1^Eme1^Ese1; \
+	    Da0 = Cx^ROL32(Du1, 1); \
+	    Cz = Ebu1^Egu1^Eku1^Emu1^Esu1; \
+	    Du0 = Ebe0^Ege0^Eke0^Eme0^Ese0; \
+	    Da1 = Cz^Du0; \
+\
+	    Cw = Ebi0^Egi0^Eki0^Emi0^Esi0; \
+	    Do0 = Cw^ROL32(Cz, 1); \
+	    Cy = Ebi1^Egi1^Eki1^Emi1^Esi1; \
+	    Do1 = Cy^Cx; \
+\
+	    Cx = Eba0^Ega0^Eka0^Ema0^Esa0; \
+	    De0 = Cx^ROL32(Cy, 1); \
+	    Cz = Eba1^Ega1^Eka1^Ema1^Esa1; \
+	    De1 = Cz^Cw; \
+\
+	    Cy = Ebo1^Ego1^Eko1^Emo1^Eso1; \
+	    Di0 = Du0^ROL32(Cy, 1); \
+	    Cw = Ebo0^Ego0^Eko0^Emo0^Eso0; \
+	    Di1 = Du1^Cw; \
+\
+	    Du0 = Cw^ROL32(Cz, 1); \
+	    Du1 = Cy^Cx; \
+\
+	    Eba0 ^= Da0; \
+	    Ba = Eba0; \
+	    Ege0 ^= De0; \
+	    Be = ROL32(Ege0, 22); \
+	    Eki1 ^= Di1; \
+	    Bi = ROL32(Eki1, 22); \
+	    Emo1 ^= Do1; \
+	    Bo = ROL32(Emo1, 11); \
+	    Esu0 ^= Du0; \
+	    Bu = ROL32(Esu0, 7); \
+	    Aba0 =   Ba ^((~Be)&  Bi ); \
+	    Aba0 ^= *(pRoundConstants++); \
+	    Abe0 =   Be ^((~Bi)&  Bo ); \
+	    Abi0 =   Bi ^((~Bo)&  Bu ); \
+	    Abo0 =   Bo ^((~Bu)&  Ba ); \
+	    Abu0 =   Bu ^((~Ba)&  Be ); \
+\
+	    Ebo0 ^= Do0; \
+	    Ba = ROL32(Ebo0, 14); \
+	    Egu0 ^= Du0; \
+	    Be = ROL32(Egu0, 10); \
+	    Eka1 ^= Da1; \
+	    Bi = ROL32(Eka1, 2); \
+	    Eme1 ^= De1; \
+	    Bo = ROL32(Eme1, 23); \
+	    Esi1 ^= Di1; \
+	    Bu = ROL32(Esi1, 31); \
+	    Aga0 =   Ba ^((~Be)&  Bi ); \
+	    Age0 =   Be ^((~Bi)&  Bo ); \
+	    Agi0 =   Bi ^((~Bo)&  Bu ); \
+	    Ago0 =   Bo ^((~Bu)&  Ba ); \
+	    Agu0 =   Bu ^((~Ba)&  Be ); \
+\
+	    Ebe1 ^= De1; \
+	    Ba = ROL32(Ebe1, 1); \
+	    Egi0 ^= Di0; \
+	    Be = ROL32(Egi0, 3); \
+	    Eko1 ^= Do1; \
+	    Bi = ROL32(Eko1, 13); \
+	    Emu0 ^= Du0; \
+	    Bo = ROL32(Emu0, 4); \
+	    Esa0 ^= Da0; \
+	    Bu = ROL32(Esa0, 9); \
+	    Aka0 =   Ba ^((~Be)&  Bi ); \
+	    Ake0 =   Be ^((~Bi)&  Bo ); \
+	    Aki0 =   Bi ^((~Bo)&  Bu ); \
+	    Ako0 =   Bo ^((~Bu)&  Ba ); \
+	    Aku0 =   Bu ^((~Ba)&  Be ); \
+\
+	    Ebu1 ^= Du1; \
+	    Ba = ROL32(Ebu1, 14); \
+	    Ega0 ^= Da0; \
+	    Be = ROL32(Ega0, 18); \
+	    Eke0 ^= De0; \
+	    Bi = ROL32(Eke0, 5); \
+	    Emi1 ^= Di1; \
+	    Bo = ROL32(Emi1, 8); \
+	    Eso0 ^= Do0; \
+	    Bu = ROL32(Eso0, 28); \
+	    Ama0 =   Ba ^((~Be)&  Bi ); \
+	    Ame0 =   Be ^((~Bi)&  Bo ); \
+	    Ami0 =   Bi ^((~Bo)&  Bu ); \
+	    Amo0 =   Bo ^((~Bu)&  Ba ); \
+	    Amu0 =   Bu ^((~Ba)&  Be ); \
+\
+	    Ebi0 ^= Di0; \
+	    Ba = ROL32(Ebi0, 31); \
+	    Ego1 ^= Do1; \
+	    Be = ROL32(Ego1, 28); \
+	    Eku1 ^= Du1; \
+	    Bi = ROL32(Eku1, 20); \
+	    Ema1 ^= Da1; \
+	    Bo = ROL32(Ema1, 21); \
+	    Ese0 ^= De0; \
+	    Bu = ROL32(Ese0, 1); \
+	    Asa0 =   Ba ^((~Be)&  Bi ); \
+	    Ase0 =   Be ^((~Bi)&  Bo ); \
+	    Asi0 =   Bi ^((~Bo)&  Bu ); \
+	    Aso0 =   Bo ^((~Bu)&  Ba ); \
+	    Asu0 =   Bu ^((~Ba)&  Be ); \
+\
+	    Eba1 ^= Da1; \
+	    Ba = Eba1; \
+	    Ege1 ^= De1; \
+	    Be = ROL32(Ege1, 22); \
+	    Eki0 ^= Di0; \
+	    Bi = ROL32(Eki0, 21); \
+	    Emo0 ^= Do0; \
+	    Bo = ROL32(Emo0, 10); \
+	    Esu1 ^= Du1; \
+	    Bu = ROL32(Esu1, 7); \
+	    Aba1 =   Ba ^((~Be)&  Bi ); \
+	    Aba1 ^= *(pRoundConstants++); \
+	    Abe1 =   Be ^((~Bi)&  Bo ); \
+	    Abi1 =   Bi ^((~Bo)&  Bu ); \
+	    Abo1 =   Bo ^((~Bu)&  Ba ); \
+	    Abu1 =   Bu ^((~Ba)&  Be ); \
+\
+	    Ebo1 ^= Do1; \
+	    Ba = ROL32(Ebo1, 14); \
+	    Egu1 ^= Du1; \
+	    Be = ROL32(Egu1, 10); \
+	    Eka0 ^= Da0; \
+	    Bi = ROL32(Eka0, 1); \
+	    Eme0 ^= De0; \
+	    Bo = ROL32(Eme0, 22); \
+	    Esi0 ^= Di0; \
+	    Bu = ROL32(Esi0, 30); \
+	    Aga1 =   Ba ^((~Be)&  Bi ); \
+	    Age1 =   Be ^((~Bi)&  Bo ); \
+	    Agi1 =   Bi ^((~Bo)&  Bu ); \
+	    Ago1 =   Bo ^((~Bu)&  Ba ); \
+	    Agu1 =   Bu ^((~Ba)&  Be ); \
+\
+	    Ebe0 ^= De0; \
+	    Ba = Ebe0; \
+	    Egi1 ^= Di1; \
+	    Be = ROL32(Egi1, 3); \
+	    Eko0 ^= Do0; \
+	    Bi = ROL32(Eko0, 12); \
+	    Emu1 ^= Du1; \
+	    Bo = ROL32(Emu1, 4); \
+	    Esa1 ^= Da1; \
+	    Bu = ROL32(Esa1, 9); \
+	    Aka1 =   Ba ^((~Be)&  Bi ); \
+	    Ake1 =   Be ^((~Bi)&  Bo ); \
+	    Aki1 =   Bi ^((~Bo)&  Bu ); \
+	    Ako1 =   Bo ^((~Bu)&  Ba ); \
+	    Aku1 =   Bu ^((~Ba)&  Be ); \
+\
+	    Ebu0 ^= Du0; \
+	    Ba = ROL32(Ebu0, 13); \
+	    Ega1 ^= Da1; \
+	    Be = ROL32(Ega1, 18); \
+	    Eke1 ^= De1; \
+	    Bi = ROL32(Eke1, 5); \
+	    Emi0 ^= Di0; \
+	    Bo = ROL32(Emi0, 7); \
+	    Eso1 ^= Do1; \
+	    Bu = ROL32(Eso1, 28); \
+	    Ama1 =   Ba ^((~Be)&  Bi ); \
+	    Ame1 =   Be ^((~Bi)&  Bo ); \
+	    Ami1 =   Bi ^((~Bo)&  Bu ); \
+	    Amo1 =   Bo ^((~Bu)&  Ba ); \
+	    Amu1 =   Bu ^((~Ba)&  Be ); \
+\
+	    Ebi1 ^= Di1; \
+	    Ba = ROL32(Ebi1, 31); \
+	    Ego0 ^= Do0; \
+	    Be = ROL32(Ego0, 27); \
+	    Eku0 ^= Du0; \
+	    Bi = ROL32(Eku0, 19); \
+	    Ema0 ^= Da0; \
+	    Bo = ROL32(Ema0, 20); \
+	    Ese1 ^= De1; \
+	    Bu = ROL32(Ese1, 1); \
+	    Asa1 =   Ba ^((~Be)&  Bi ); \
+	    Ase1 =   Be ^((~Bi)&  Bo ); \
+	    Asi1 =   Bi ^((~Bo)&  Bu ); \
+	    Aso1 =   Bo ^((~Bu)&  Ba ); \
+	    Asu1 =   Bu ^((~Ba)&  Be ); \
+    } \
+    copyToState(state, A) \
+}
+
+#define copyFromState(X, state) \
+    X##ba0 = state[ 0]; \
+    X##ba1 = state[ 1]; \
+    X##be0 = state[ 2]; \
+    X##be1 = state[ 3]; \
+    X##bi0 = state[ 4]; \
+    X##bi1 = state[ 5]; \
+    X##bo0 = state[ 6]; \
+    X##bo1 = state[ 7]; \
+    X##bu0 = state[ 8]; \
+    X##bu1 = state[ 9]; \
+    X##ga0 = state[10]; \
+    X##ga1 = state[11]; \
+    X##ge0 = state[12]; \
+    X##ge1 = state[13]; \
+    X##gi0 = state[14]; \
+    X##gi1 = state[15]; \
+    X##go0 = state[16]; \
+    X##go1 = state[17]; \
+    X##gu0 = state[18]; \
+    X##gu1 = state[19]; \
+    X##ka0 = state[20]; \
+    X##ka1 = state[21]; \
+    X##ke0 = state[22]; \
+    X##ke1 = state[23]; \
+    X##ki0 = state[24]; \
+    X##ki1 = state[25]; \
+    X##ko0 = state[26]; \
+    X##ko1 = state[27]; \
+    X##ku0 = state[28]; \
+    X##ku1 = state[29]; \
+    X##ma0 = state[30]; \
+    X##ma1 = state[31]; \
+    X##me0 = state[32]; \
+    X##me1 = state[33]; \
+    X##mi0 = state[34]; \
+    X##mi1 = state[35]; \
+    X##mo0 = state[36]; \
+    X##mo1 = state[37]; \
+    X##mu0 = state[38]; \
+    X##mu1 = state[39]; \
+    X##sa0 = state[40]; \
+    X##sa1 = state[41]; \
+    X##se0 = state[42]; \
+    X##se1 = state[43]; \
+    X##si0 = state[44]; \
+    X##si1 = state[45]; \
+    X##so0 = state[46]; \
+    X##so1 = state[47]; \
+    X##su0 = state[48]; \
+    X##su1 = state[49]; \
+
+#define copyToState(state, X) \
+    state[ 0] = X##ba0; \
+    state[ 1] = X##ba1; \
+    state[ 2] = X##be0; \
+    state[ 3] = X##be1; \
+    state[ 4] = X##bi0; \
+    state[ 5] = X##bi1; \
+    state[ 6] = X##bo0; \
+    state[ 7] = X##bo1; \
+    state[ 8] = X##bu0; \
+    state[ 9] = X##bu1; \
+    state[10] = X##ga0; \
+    state[11] = X##ga1; \
+    state[12] = X##ge0; \
+    state[13] = X##ge1; \
+    state[14] = X##gi0; \
+    state[15] = X##gi1; \
+    state[16] = X##go0; \
+    state[17] = X##go1; \
+    state[18] = X##gu0; \
+    state[19] = X##gu1; \
+    state[20] = X##ka0; \
+    state[21] = X##ka1; \
+    state[22] = X##ke0; \
+    state[23] = X##ke1; \
+    state[24] = X##ki0; \
+    state[25] = X##ki1; \
+    state[26] = X##ko0; \
+    state[27] = X##ko1; \
+    state[28] = X##ku0; \
+    state[29] = X##ku1; \
+    state[30] = X##ma0; \
+    state[31] = X##ma1; \
+    state[32] = X##me0; \
+    state[33] = X##me1; \
+    state[34] = X##mi0; \
+    state[35] = X##mi1; \
+    state[36] = X##mo0; \
+    state[37] = X##mo1; \
+    state[38] = X##mu0; \
+    state[39] = X##mu1; \
+    state[40] = X##sa0; \
+    state[41] = X##sa1; \
+    state[42] = X##se0; \
+    state[43] = X##se1; \
+    state[44] = X##si0; \
+    state[45] = X##si1; \
+    state[46] = X##so0; \
+    state[47] = X##so1; \
+    state[48] = X##su0; \
+    state[49] = X##su1; \
+
diff --git a/Modules/_sha3/keccak/KeccakF-1600-32-s1.macros b/Modules/_sha3/keccak/KeccakF-1600-32-s1.macros
new file mode 100644
--- /dev/null
+++ b/Modules/_sha3/keccak/KeccakF-1600-32-s1.macros
@@ -0,0 +1,1187 @@
+/*
+Code automatically generated by KeccakTools!
+
+The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+Michaël Peeters and Gilles Van Assche. For more information, feedback or
+questions, please refer to our website: http://keccak.noekeon.org/
+
+Implementation by the designers,
+hereby denoted as "the implementer".
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#define declareABCDE /* Declare the UINT32 working variables used by the round and copy macros below: A*, B* and E* (50 each) plus C* and D* (10 each); the trailing 0/1 selects one of the two 32-bit words of a pair. */ \
+    UINT32 Aba0, Abe0, Abi0, Abo0, Abu0; \
+    UINT32 Aba1, Abe1, Abi1, Abo1, Abu1; \
+    UINT32 Aga0, Age0, Agi0, Ago0, Agu0; \
+    UINT32 Aga1, Age1, Agi1, Ago1, Agu1; \
+    UINT32 Aka0, Ake0, Aki0, Ako0, Aku0; \
+    UINT32 Aka1, Ake1, Aki1, Ako1, Aku1; \
+    UINT32 Ama0, Ame0, Ami0, Amo0, Amu0; \
+    UINT32 Ama1, Ame1, Ami1, Amo1, Amu1; \
+    UINT32 Asa0, Ase0, Asi0, Aso0, Asu0; \
+    UINT32 Asa1, Ase1, Asi1, Aso1, Asu1; \
+    UINT32 Bba0, Bbe0, Bbi0, Bbo0, Bbu0; \
+    UINT32 Bba1, Bbe1, Bbi1, Bbo1, Bbu1; \
+    UINT32 Bga0, Bge0, Bgi0, Bgo0, Bgu0; \
+    UINT32 Bga1, Bge1, Bgi1, Bgo1, Bgu1; \
+    UINT32 Bka0, Bke0, Bki0, Bko0, Bku0; \
+    UINT32 Bka1, Bke1, Bki1, Bko1, Bku1; \
+    UINT32 Bma0, Bme0, Bmi0, Bmo0, Bmu0; \
+    UINT32 Bma1, Bme1, Bmi1, Bmo1, Bmu1; \
+    UINT32 Bsa0, Bse0, Bsi0, Bso0, Bsu0; \
+    UINT32 Bsa1, Bse1, Bsi1, Bso1, Bsu1; \
+    UINT32 Ca0, Ce0, Ci0, Co0, Cu0; \
+    UINT32 Ca1, Ce1, Ci1, Co1, Cu1; \
+    UINT32 Da0, De0, Di0, Do0, Du0; \
+    UINT32 Da1, De1, Di1, Do1, Du1; \
+    UINT32 Eba0, Ebe0, Ebi0, Ebo0, Ebu0; \
+    UINT32 Eba1, Ebe1, Ebi1, Ebo1, Ebu1; \
+    UINT32 Ega0, Ege0, Egi0, Ego0, Egu0; \
+    UINT32 Ega1, Ege1, Egi1, Ego1, Egu1; \
+    UINT32 Eka0, Eke0, Eki0, Eko0, Eku0; \
+    UINT32 Eka1, Eke1, Eki1, Eko1, Eku1; \
+    UINT32 Ema0, Eme0, Emi0, Emo0, Emu0; \
+    UINT32 Ema1, Eme1, Emi1, Emo1, Emu1; \
+    UINT32 Esa0, Ese0, Esi0, Eso0, Esu0; \
+    UINT32 Esa1, Ese1, Esi1, Eso1, Esu1; \
+
+#define prepareTheta /* Set each of Ca0..Cu1 to the XOR of the five A variables that share its vowel and 0/1 suffix (e.g. Ca0 = Aba0^Aga0^Aka0^Ama0^Asa0); the round macros read these C values to build Da0..Du1. The A prefix is fixed here, not a parameter. */ \
+    Ca0 = Aba0^Aga0^Aka0^Ama0^Asa0; \
+    Ca1 = Aba1^Aga1^Aka1^Ama1^Asa1; \
+    Ce0 = Abe0^Age0^Ake0^Ame0^Ase0; \
+    Ce1 = Abe1^Age1^Ake1^Ame1^Ase1; \
+    Ci0 = Abi0^Agi0^Aki0^Ami0^Asi0; \
+    Ci1 = Abi1^Agi1^Aki1^Ami1^Asi1; \
+    Co0 = Abo0^Ago0^Ako0^Amo0^Aso0; \
+    Co1 = Abo1^Ago1^Ako1^Amo1^Aso1; \
+    Cu0 = Abu0^Agu0^Aku0^Amu0^Asu0; \
+    Cu1 = Abu1^Agu1^Aku1^Amu1^Asu1; \
+
+#ifdef UseBebigokimisa
+/*  --- Code for round, with prepare-theta (lane complementing pattern 'bebigokimisa'): i indexes KeccakF1600RoundConstants_int2_0/_1; A and E are variable-name prefixes. Reads Ca0..Cu1 (must be set on entry) to build Da0..Du1, XORs the D values into the A## variables in place, writes the E## variables, and rebuilds Ca0..Cu1 as the XOR of the E## variables sharing each vowel and 0/1 suffix, ready for the next round. */
+/*  --- using factor 2 interleaving, 64-bit lanes mapped to 32-bit words */
+#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \
+    Da0 = Cu0^ROL32(Ce1, 1); \
+    Da1 = Cu1^Ce0; \
+    De0 = Ca0^ROL32(Ci1, 1); \
+    De1 = Ca1^Ci0; \
+    Di0 = Ce0^ROL32(Co1, 1); \
+    Di1 = Ce1^Co0; \
+    Do0 = Ci0^ROL32(Cu1, 1); \
+    Do1 = Ci1^Cu0; \
+    Du0 = Co0^ROL32(Ca1, 1); \
+    Du1 = Co1^Ca0; \
+    /* E##ba0..E##bu0 (round constant XORed into E##ba0); these assignments start Ca0..Cu0 */ \
+    A##ba0 ^= Da0; \
+    Bba0 = A##ba0; \
+    A##ge0 ^= De0; \
+    Bbe0 = ROL32(A##ge0, 22); \
+    A##ki1 ^= Di1; \
+    Bbi0 = ROL32(A##ki1, 22); \
+    A##mo1 ^= Do1; \
+    Bbo0 = ROL32(A##mo1, 11); \
+    A##su0 ^= Du0; \
+    Bbu0 = ROL32(A##su0, 7); \
+    E##ba0 =   Bba0 ^(  Bbe0 |  Bbi0 ); \
+    E##ba0 ^= KeccakF1600RoundConstants_int2_0[i]; \
+    Ca0 = E##ba0; \
+    E##be0 =   Bbe0 ^((~Bbi0)|  Bbo0 ); \
+    Ce0 = E##be0; \
+    E##bi0 =   Bbi0 ^(  Bbo0 &  Bbu0 ); \
+    Ci0 = E##bi0; \
+    E##bo0 =   Bbo0 ^(  Bbu0 |  Bba0 ); \
+    Co0 = E##bo0; \
+    E##bu0 =   Bbu0 ^(  Bba0 &  Bbe0 ); \
+    Cu0 = E##bu0; \
+    /* E##ba1..E##bu1 (round constant XORed into E##ba1); these assignments start Ca1..Cu1 */ \
+    A##ba1 ^= Da1; \
+    Bba1 = A##ba1; \
+    A##ge1 ^= De1; \
+    Bbe1 = ROL32(A##ge1, 22); \
+    A##ki0 ^= Di0; \
+    Bbi1 = ROL32(A##ki0, 21); \
+    A##mo0 ^= Do0; \
+    Bbo1 = ROL32(A##mo0, 10); \
+    A##su1 ^= Du1; \
+    Bbu1 = ROL32(A##su1, 7); \
+    E##ba1 =   Bba1 ^(  Bbe1 |  Bbi1 ); \
+    E##ba1 ^= KeccakF1600RoundConstants_int2_1[i]; \
+    Ca1 = E##ba1; \
+    E##be1 =   Bbe1 ^((~Bbi1)|  Bbo1 ); \
+    Ce1 = E##be1; \
+    E##bi1 =   Bbi1 ^(  Bbo1 &  Bbu1 ); \
+    Ci1 = E##bi1; \
+    E##bo1 =   Bbo1 ^(  Bbu1 |  Bba1 ); \
+    Co1 = E##bo1; \
+    E##bu1 =   Bbu1 ^(  Bba1 &  Bbe1 ); \
+    Cu1 = E##bu1; \
+    /* E##ga0..E##gu0, folded into Ca0..Cu0 */ \
+    A##bo0 ^= Do0; \
+    Bga0 = ROL32(A##bo0, 14); \
+    A##gu0 ^= Du0; \
+    Bge0 = ROL32(A##gu0, 10); \
+    A##ka1 ^= Da1; \
+    Bgi0 = ROL32(A##ka1, 2); \
+    A##me1 ^= De1; \
+    Bgo0 = ROL32(A##me1, 23); \
+    A##si1 ^= Di1; \
+    Bgu0 = ROL32(A##si1, 31); \
+    E##ga0 =   Bga0 ^(  Bge0 |  Bgi0 ); \
+    Ca0 ^= E##ga0; \
+    E##ge0 =   Bge0 ^(  Bgi0 &  Bgo0 ); \
+    Ce0 ^= E##ge0; \
+    E##gi0 =   Bgi0 ^(  Bgo0 |(~Bgu0)); \
+    Ci0 ^= E##gi0; \
+    E##go0 =   Bgo0 ^(  Bgu0 |  Bga0 ); \
+    Co0 ^= E##go0; \
+    E##gu0 =   Bgu0 ^(  Bga0 &  Bge0 ); \
+    Cu0 ^= E##gu0; \
+    /* E##ga1..E##gu1, folded into Ca1..Cu1 */ \
+    A##bo1 ^= Do1; \
+    Bga1 = ROL32(A##bo1, 14); \
+    A##gu1 ^= Du1; \
+    Bge1 = ROL32(A##gu1, 10); \
+    A##ka0 ^= Da0; \
+    Bgi1 = ROL32(A##ka0, 1); \
+    A##me0 ^= De0; \
+    Bgo1 = ROL32(A##me0, 22); \
+    A##si0 ^= Di0; \
+    Bgu1 = ROL32(A##si0, 30); \
+    E##ga1 =   Bga1 ^(  Bge1 |  Bgi1 ); \
+    Ca1 ^= E##ga1; \
+    E##ge1 =   Bge1 ^(  Bgi1 &  Bgo1 ); \
+    Ce1 ^= E##ge1; \
+    E##gi1 =   Bgi1 ^(  Bgo1 |(~Bgu1)); \
+    Ci1 ^= E##gi1; \
+    E##go1 =   Bgo1 ^(  Bgu1 |  Bga1 ); \
+    Co1 ^= E##go1; \
+    E##gu1 =   Bgu1 ^(  Bga1 &  Bge1 ); \
+    Cu1 ^= E##gu1; \
+    /* E##ka0..E##ku0, folded into Ca0..Cu0 */ \
+    A##be1 ^= De1; \
+    Bka0 = ROL32(A##be1, 1); \
+    A##gi0 ^= Di0; \
+    Bke0 = ROL32(A##gi0, 3); \
+    A##ko1 ^= Do1; \
+    Bki0 = ROL32(A##ko1, 13); \
+    A##mu0 ^= Du0; \
+    Bko0 = ROL32(A##mu0, 4); \
+    A##sa0 ^= Da0; \
+    Bku0 = ROL32(A##sa0, 9); \
+    E##ka0 =   Bka0 ^(  Bke0 |  Bki0 ); \
+    Ca0 ^= E##ka0; \
+    E##ke0 =   Bke0 ^(  Bki0 &  Bko0 ); \
+    Ce0 ^= E##ke0; \
+    E##ki0 =   Bki0 ^((~Bko0)&  Bku0 ); \
+    Ci0 ^= E##ki0; \
+    E##ko0 = (~Bko0)^(  Bku0 |  Bka0 ); \
+    Co0 ^= E##ko0; \
+    E##ku0 =   Bku0 ^(  Bka0 &  Bke0 ); \
+    Cu0 ^= E##ku0; \
+    /* E##ka1..E##ku1, folded into Ca1..Cu1 */ \
+    A##be0 ^= De0; \
+    Bka1 = A##be0; \
+    A##gi1 ^= Di1; \
+    Bke1 = ROL32(A##gi1, 3); \
+    A##ko0 ^= Do0; \
+    Bki1 = ROL32(A##ko0, 12); \
+    A##mu1 ^= Du1; \
+    Bko1 = ROL32(A##mu1, 4); \
+    A##sa1 ^= Da1; \
+    Bku1 = ROL32(A##sa1, 9); \
+    E##ka1 =   Bka1 ^(  Bke1 |  Bki1 ); \
+    Ca1 ^= E##ka1; \
+    E##ke1 =   Bke1 ^(  Bki1 &  Bko1 ); \
+    Ce1 ^= E##ke1; \
+    E##ki1 =   Bki1 ^((~Bko1)&  Bku1 ); \
+    Ci1 ^= E##ki1; \
+    E##ko1 = (~Bko1)^(  Bku1 |  Bka1 ); \
+    Co1 ^= E##ko1; \
+    E##ku1 =   Bku1 ^(  Bka1 &  Bke1 ); \
+    Cu1 ^= E##ku1; \
+    /* E##ma0..E##mu0, folded into Ca0..Cu0 */ \
+    A##bu1 ^= Du1; \
+    Bma0 = ROL32(A##bu1, 14); \
+    A##ga0 ^= Da0; \
+    Bme0 = ROL32(A##ga0, 18); \
+    A##ke0 ^= De0; \
+    Bmi0 = ROL32(A##ke0, 5); \
+    A##mi1 ^= Di1; \
+    Bmo0 = ROL32(A##mi1, 8); \
+    A##so0 ^= Do0; \
+    Bmu0 = ROL32(A##so0, 28); \
+    E##ma0 =   Bma0 ^(  Bme0 &  Bmi0 ); \
+    Ca0 ^= E##ma0; \
+    E##me0 =   Bme0 ^(  Bmi0 |  Bmo0 ); \
+    Ce0 ^= E##me0; \
+    E##mi0 =   Bmi0 ^((~Bmo0)|  Bmu0 ); \
+    Ci0 ^= E##mi0; \
+    E##mo0 = (~Bmo0)^(  Bmu0 &  Bma0 ); \
+    Co0 ^= E##mo0; \
+    E##mu0 =   Bmu0 ^(  Bma0 |  Bme0 ); \
+    Cu0 ^= E##mu0; \
+    /* E##ma1..E##mu1, folded into Ca1..Cu1 */ \
+    A##bu0 ^= Du0; \
+    Bma1 = ROL32(A##bu0, 13); \
+    A##ga1 ^= Da1; \
+    Bme1 = ROL32(A##ga1, 18); \
+    A##ke1 ^= De1; \
+    Bmi1 = ROL32(A##ke1, 5); \
+    A##mi0 ^= Di0; \
+    Bmo1 = ROL32(A##mi0, 7); \
+    A##so1 ^= Do1; \
+    Bmu1 = ROL32(A##so1, 28); \
+    E##ma1 =   Bma1 ^(  Bme1 &  Bmi1 ); \
+    Ca1 ^= E##ma1; \
+    E##me1 =   Bme1 ^(  Bmi1 |  Bmo1 ); \
+    Ce1 ^= E##me1; \
+    E##mi1 =   Bmi1 ^((~Bmo1)|  Bmu1 ); \
+    Ci1 ^= E##mi1; \
+    E##mo1 = (~Bmo1)^(  Bmu1 &  Bma1 ); \
+    Co1 ^= E##mo1; \
+    E##mu1 =   Bmu1 ^(  Bma1 |  Bme1 ); \
+    Cu1 ^= E##mu1; \
+    /* E##sa0..E##su0, folded into Ca0..Cu0 */ \
+    A##bi0 ^= Di0; \
+    Bsa0 = ROL32(A##bi0, 31); \
+    A##go1 ^= Do1; \
+    Bse0 = ROL32(A##go1, 28); \
+    A##ku1 ^= Du1; \
+    Bsi0 = ROL32(A##ku1, 20); \
+    A##ma1 ^= Da1; \
+    Bso0 = ROL32(A##ma1, 21); \
+    A##se0 ^= De0; \
+    Bsu0 = ROL32(A##se0, 1); \
+    E##sa0 =   Bsa0 ^((~Bse0)&  Bsi0 ); \
+    Ca0 ^= E##sa0; \
+    E##se0 = (~Bse0)^(  Bsi0 |  Bso0 ); \
+    Ce0 ^= E##se0; \
+    E##si0 =   Bsi0 ^(  Bso0 &  Bsu0 ); \
+    Ci0 ^= E##si0; \
+    E##so0 =   Bso0 ^(  Bsu0 |  Bsa0 ); \
+    Co0 ^= E##so0; \
+    E##su0 =   Bsu0 ^(  Bsa0 &  Bse0 ); \
+    Cu0 ^= E##su0; \
+    /* E##sa1..E##su1, folded into Ca1..Cu1 */ \
+    A##bi1 ^= Di1; \
+    Bsa1 = ROL32(A##bi1, 31); \
+    A##go0 ^= Do0; \
+    Bse1 = ROL32(A##go0, 27); \
+    A##ku0 ^= Du0; \
+    Bsi1 = ROL32(A##ku0, 19); \
+    A##ma0 ^= Da0; \
+    Bso1 = ROL32(A##ma0, 20); \
+    A##se1 ^= De1; \
+    Bsu1 = ROL32(A##se1, 1); \
+    E##sa1 =   Bsa1 ^((~Bse1)&  Bsi1 ); \
+    Ca1 ^= E##sa1; \
+    E##se1 = (~Bse1)^(  Bsi1 |  Bso1 ); \
+    Ce1 ^= E##se1; \
+    E##si1 =   Bsi1 ^(  Bso1 &  Bsu1 ); \
+    Ci1 ^= E##si1; \
+    E##so1 =   Bso1 ^(  Bsu1 |  Bsa1 ); \
+    Co1 ^= E##so1; \
+    E##su1 =   Bsu1 ^(  Bsa1 &  Bse1 ); \
+    Cu1 ^= E##su1; \
+\
+
+/*  --- Code for round (lane complementing pattern 'bebigokimisa'): i indexes KeccakF1600RoundConstants_int2_0/_1; A and E are variable-name prefixes. Reads Ca0..Cu1 (must be set on entry) to build Da0..Du1, XORs the D values into the A## variables in place and writes the E## variables; unlike the prepare-theta variant it does not update Ca0..Cu1. */
+/*  --- using factor 2 interleaving, 64-bit lanes mapped to 32-bit words */
+#define thetaRhoPiChiIota(i, A, E) \
+    Da0 = Cu0^ROL32(Ce1, 1); \
+    Da1 = Cu1^Ce0; \
+    De0 = Ca0^ROL32(Ci1, 1); \
+    De1 = Ca1^Ci0; \
+    Di0 = Ce0^ROL32(Co1, 1); \
+    Di1 = Ce1^Co0; \
+    Do0 = Ci0^ROL32(Cu1, 1); \
+    Do1 = Ci1^Cu0; \
+    Du0 = Co0^ROL32(Ca1, 1); \
+    Du1 = Co1^Ca0; \
+    /* E##ba0..E##bu0 (round constant XORed into E##ba0) */ \
+    A##ba0 ^= Da0; \
+    Bba0 = A##ba0; \
+    A##ge0 ^= De0; \
+    Bbe0 = ROL32(A##ge0, 22); \
+    A##ki1 ^= Di1; \
+    Bbi0 = ROL32(A##ki1, 22); \
+    A##mo1 ^= Do1; \
+    Bbo0 = ROL32(A##mo1, 11); \
+    A##su0 ^= Du0; \
+    Bbu0 = ROL32(A##su0, 7); \
+    E##ba0 =   Bba0 ^(  Bbe0 |  Bbi0 ); \
+    E##ba0 ^= KeccakF1600RoundConstants_int2_0[i]; \
+    E##be0 =   Bbe0 ^((~Bbi0)|  Bbo0 ); \
+    E##bi0 =   Bbi0 ^(  Bbo0 &  Bbu0 ); \
+    E##bo0 =   Bbo0 ^(  Bbu0 |  Bba0 ); \
+    E##bu0 =   Bbu0 ^(  Bba0 &  Bbe0 ); \
+    /* E##ba1..E##bu1 (round constant XORed into E##ba1) */ \
+    A##ba1 ^= Da1; \
+    Bba1 = A##ba1; \
+    A##ge1 ^= De1; \
+    Bbe1 = ROL32(A##ge1, 22); \
+    A##ki0 ^= Di0; \
+    Bbi1 = ROL32(A##ki0, 21); \
+    A##mo0 ^= Do0; \
+    Bbo1 = ROL32(A##mo0, 10); \
+    A##su1 ^= Du1; \
+    Bbu1 = ROL32(A##su1, 7); \
+    E##ba1 =   Bba1 ^(  Bbe1 |  Bbi1 ); \
+    E##ba1 ^= KeccakF1600RoundConstants_int2_1[i]; \
+    E##be1 =   Bbe1 ^((~Bbi1)|  Bbo1 ); \
+    E##bi1 =   Bbi1 ^(  Bbo1 &  Bbu1 ); \
+    E##bo1 =   Bbo1 ^(  Bbu1 |  Bba1 ); \
+    E##bu1 =   Bbu1 ^(  Bba1 &  Bbe1 ); \
+    /* E##ga0..E##gu0 */ \
+    A##bo0 ^= Do0; \
+    Bga0 = ROL32(A##bo0, 14); \
+    A##gu0 ^= Du0; \
+    Bge0 = ROL32(A##gu0, 10); \
+    A##ka1 ^= Da1; \
+    Bgi0 = ROL32(A##ka1, 2); \
+    A##me1 ^= De1; \
+    Bgo0 = ROL32(A##me1, 23); \
+    A##si1 ^= Di1; \
+    Bgu0 = ROL32(A##si1, 31); \
+    E##ga0 =   Bga0 ^(  Bge0 |  Bgi0 ); \
+    E##ge0 =   Bge0 ^(  Bgi0 &  Bgo0 ); \
+    E##gi0 =   Bgi0 ^(  Bgo0 |(~Bgu0)); \
+    E##go0 =   Bgo0 ^(  Bgu0 |  Bga0 ); \
+    E##gu0 =   Bgu0 ^(  Bga0 &  Bge0 ); \
+    /* E##ga1..E##gu1 */ \
+    A##bo1 ^= Do1; \
+    Bga1 = ROL32(A##bo1, 14); \
+    A##gu1 ^= Du1; \
+    Bge1 = ROL32(A##gu1, 10); \
+    A##ka0 ^= Da0; \
+    Bgi1 = ROL32(A##ka0, 1); \
+    A##me0 ^= De0; \
+    Bgo1 = ROL32(A##me0, 22); \
+    A##si0 ^= Di0; \
+    Bgu1 = ROL32(A##si0, 30); \
+    E##ga1 =   Bga1 ^(  Bge1 |  Bgi1 ); \
+    E##ge1 =   Bge1 ^(  Bgi1 &  Bgo1 ); \
+    E##gi1 =   Bgi1 ^(  Bgo1 |(~Bgu1)); \
+    E##go1 =   Bgo1 ^(  Bgu1 |  Bga1 ); \
+    E##gu1 =   Bgu1 ^(  Bga1 &  Bge1 ); \
+    /* E##ka0..E##ku0 */ \
+    A##be1 ^= De1; \
+    Bka0 = ROL32(A##be1, 1); \
+    A##gi0 ^= Di0; \
+    Bke0 = ROL32(A##gi0, 3); \
+    A##ko1 ^= Do1; \
+    Bki0 = ROL32(A##ko1, 13); \
+    A##mu0 ^= Du0; \
+    Bko0 = ROL32(A##mu0, 4); \
+    A##sa0 ^= Da0; \
+    Bku0 = ROL32(A##sa0, 9); \
+    E##ka0 =   Bka0 ^(  Bke0 |  Bki0 ); \
+    E##ke0 =   Bke0 ^(  Bki0 &  Bko0 ); \
+    E##ki0 =   Bki0 ^((~Bko0)&  Bku0 ); \
+    E##ko0 = (~Bko0)^(  Bku0 |  Bka0 ); \
+    E##ku0 =   Bku0 ^(  Bka0 &  Bke0 ); \
+    /* E##ka1..E##ku1 */ \
+    A##be0 ^= De0; \
+    Bka1 = A##be0; \
+    A##gi1 ^= Di1; \
+    Bke1 = ROL32(A##gi1, 3); \
+    A##ko0 ^= Do0; \
+    Bki1 = ROL32(A##ko0, 12); \
+    A##mu1 ^= Du1; \
+    Bko1 = ROL32(A##mu1, 4); \
+    A##sa1 ^= Da1; \
+    Bku1 = ROL32(A##sa1, 9); \
+    E##ka1 =   Bka1 ^(  Bke1 |  Bki1 ); \
+    E##ke1 =   Bke1 ^(  Bki1 &  Bko1 ); \
+    E##ki1 =   Bki1 ^((~Bko1)&  Bku1 ); \
+    E##ko1 = (~Bko1)^(  Bku1 |  Bka1 ); \
+    E##ku1 =   Bku1 ^(  Bka1 &  Bke1 ); \
+    /* E##ma0..E##mu0 */ \
+    A##bu1 ^= Du1; \
+    Bma0 = ROL32(A##bu1, 14); \
+    A##ga0 ^= Da0; \
+    Bme0 = ROL32(A##ga0, 18); \
+    A##ke0 ^= De0; \
+    Bmi0 = ROL32(A##ke0, 5); \
+    A##mi1 ^= Di1; \
+    Bmo0 = ROL32(A##mi1, 8); \
+    A##so0 ^= Do0; \
+    Bmu0 = ROL32(A##so0, 28); \
+    E##ma0 =   Bma0 ^(  Bme0 &  Bmi0 ); \
+    E##me0 =   Bme0 ^(  Bmi0 |  Bmo0 ); \
+    E##mi0 =   Bmi0 ^((~Bmo0)|  Bmu0 ); \
+    E##mo0 = (~Bmo0)^(  Bmu0 &  Bma0 ); \
+    E##mu0 =   Bmu0 ^(  Bma0 |  Bme0 ); \
+    /* E##ma1..E##mu1 */ \
+    A##bu0 ^= Du0; \
+    Bma1 = ROL32(A##bu0, 13); \
+    A##ga1 ^= Da1; \
+    Bme1 = ROL32(A##ga1, 18); \
+    A##ke1 ^= De1; \
+    Bmi1 = ROL32(A##ke1, 5); \
+    A##mi0 ^= Di0; \
+    Bmo1 = ROL32(A##mi0, 7); \
+    A##so1 ^= Do1; \
+    Bmu1 = ROL32(A##so1, 28); \
+    E##ma1 =   Bma1 ^(  Bme1 &  Bmi1 ); \
+    E##me1 =   Bme1 ^(  Bmi1 |  Bmo1 ); \
+    E##mi1 =   Bmi1 ^((~Bmo1)|  Bmu1 ); \
+    E##mo1 = (~Bmo1)^(  Bmu1 &  Bma1 ); \
+    E##mu1 =   Bmu1 ^(  Bma1 |  Bme1 ); \
+    /* E##sa0..E##su0 */ \
+    A##bi0 ^= Di0; \
+    Bsa0 = ROL32(A##bi0, 31); \
+    A##go1 ^= Do1; \
+    Bse0 = ROL32(A##go1, 28); \
+    A##ku1 ^= Du1; \
+    Bsi0 = ROL32(A##ku1, 20); \
+    A##ma1 ^= Da1; \
+    Bso0 = ROL32(A##ma1, 21); \
+    A##se0 ^= De0; \
+    Bsu0 = ROL32(A##se0, 1); \
+    E##sa0 =   Bsa0 ^((~Bse0)&  Bsi0 ); \
+    E##se0 = (~Bse0)^(  Bsi0 |  Bso0 ); \
+    E##si0 =   Bsi0 ^(  Bso0 &  Bsu0 ); \
+    E##so0 =   Bso0 ^(  Bsu0 |  Bsa0 ); \
+    E##su0 =   Bsu0 ^(  Bsa0 &  Bse0 ); \
+    /* E##sa1..E##su1 */ \
+    A##bi1 ^= Di1; \
+    Bsa1 = ROL32(A##bi1, 31); \
+    A##go0 ^= Do0; \
+    Bse1 = ROL32(A##go0, 27); \
+    A##ku0 ^= Du0; \
+    Bsi1 = ROL32(A##ku0, 19); \
+    A##ma0 ^= Da0; \
+    Bso1 = ROL32(A##ma0, 20); \
+    A##se1 ^= De1; \
+    Bsu1 = ROL32(A##se1, 1); \
+    E##sa1 =   Bsa1 ^((~Bse1)&  Bsi1 ); \
+    E##se1 = (~Bse1)^(  Bsi1 |  Bso1 ); \
+    E##si1 =   Bsi1 ^(  Bso1 &  Bsu1 ); \
+    E##so1 =   Bso1 ^(  Bsu1 |  Bsa1 ); \
+    E##su1 =   Bsu1 ^(  Bsa1 &  Bse1 ); \
+\
+
+#else /*  UseBebigokimisa */
+/*  --- Code for round, with prepare-theta (variant used when UseBebigokimisa is not defined; every E## word is computed as B ^ ((~B') & B'')): i indexes KeccakF1600RoundConstants_int2_0/_1; A and E are variable-name prefixes. Reads Ca0..Cu1 (must be set on entry) to build Da0..Du1, XORs the D values into the A## variables in place, writes the E## variables, and rebuilds Ca0..Cu1 as the XOR of the E## variables sharing each vowel and 0/1 suffix, ready for the next round. */
+/*  --- using factor 2 interleaving, 64-bit lanes mapped to 32-bit words */
+#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \
+    Da0 = Cu0^ROL32(Ce1, 1); \
+    Da1 = Cu1^Ce0; \
+    De0 = Ca0^ROL32(Ci1, 1); \
+    De1 = Ca1^Ci0; \
+    Di0 = Ce0^ROL32(Co1, 1); \
+    Di1 = Ce1^Co0; \
+    Do0 = Ci0^ROL32(Cu1, 1); \
+    Do1 = Ci1^Cu0; \
+    Du0 = Co0^ROL32(Ca1, 1); \
+    Du1 = Co1^Ca0; \
+    /* E##ba0..E##bu0 (round constant XORed into E##ba0); these assignments start Ca0..Cu0 */ \
+    A##ba0 ^= Da0; \
+    Bba0 = A##ba0; \
+    A##ge0 ^= De0; \
+    Bbe0 = ROL32(A##ge0, 22); \
+    A##ki1 ^= Di1; \
+    Bbi0 = ROL32(A##ki1, 22); \
+    A##mo1 ^= Do1; \
+    Bbo0 = ROL32(A##mo1, 11); \
+    A##su0 ^= Du0; \
+    Bbu0 = ROL32(A##su0, 7); \
+    E##ba0 =   Bba0 ^((~Bbe0)&  Bbi0 ); \
+    E##ba0 ^= KeccakF1600RoundConstants_int2_0[i]; \
+    Ca0 = E##ba0; \
+    E##be0 =   Bbe0 ^((~Bbi0)&  Bbo0 ); \
+    Ce0 = E##be0; \
+    E##bi0 =   Bbi0 ^((~Bbo0)&  Bbu0 ); \
+    Ci0 = E##bi0; \
+    E##bo0 =   Bbo0 ^((~Bbu0)&  Bba0 ); \
+    Co0 = E##bo0; \
+    E##bu0 =   Bbu0 ^((~Bba0)&  Bbe0 ); \
+    Cu0 = E##bu0; \
+    /* E##ba1..E##bu1 (round constant XORed into E##ba1); these assignments start Ca1..Cu1 */ \
+    A##ba1 ^= Da1; \
+    Bba1 = A##ba1; \
+    A##ge1 ^= De1; \
+    Bbe1 = ROL32(A##ge1, 22); \
+    A##ki0 ^= Di0; \
+    Bbi1 = ROL32(A##ki0, 21); \
+    A##mo0 ^= Do0; \
+    Bbo1 = ROL32(A##mo0, 10); \
+    A##su1 ^= Du1; \
+    Bbu1 = ROL32(A##su1, 7); \
+    E##ba1 =   Bba1 ^((~Bbe1)&  Bbi1 ); \
+    E##ba1 ^= KeccakF1600RoundConstants_int2_1[i]; \
+    Ca1 = E##ba1; \
+    E##be1 =   Bbe1 ^((~Bbi1)&  Bbo1 ); \
+    Ce1 = E##be1; \
+    E##bi1 =   Bbi1 ^((~Bbo1)&  Bbu1 ); \
+    Ci1 = E##bi1; \
+    E##bo1 =   Bbo1 ^((~Bbu1)&  Bba1 ); \
+    Co1 = E##bo1; \
+    E##bu1 =   Bbu1 ^((~Bba1)&  Bbe1 ); \
+    Cu1 = E##bu1; \
+    /* E##ga0..E##gu0, folded into Ca0..Cu0 */ \
+    A##bo0 ^= Do0; \
+    Bga0 = ROL32(A##bo0, 14); \
+    A##gu0 ^= Du0; \
+    Bge0 = ROL32(A##gu0, 10); \
+    A##ka1 ^= Da1; \
+    Bgi0 = ROL32(A##ka1, 2); \
+    A##me1 ^= De1; \
+    Bgo0 = ROL32(A##me1, 23); \
+    A##si1 ^= Di1; \
+    Bgu0 = ROL32(A##si1, 31); \
+    E##ga0 =   Bga0 ^((~Bge0)&  Bgi0 ); \
+    Ca0 ^= E##ga0; \
+    E##ge0 =   Bge0 ^((~Bgi0)&  Bgo0 ); \
+    Ce0 ^= E##ge0; \
+    E##gi0 =   Bgi0 ^((~Bgo0)&  Bgu0 ); \
+    Ci0 ^= E##gi0; \
+    E##go0 =   Bgo0 ^((~Bgu0)&  Bga0 ); \
+    Co0 ^= E##go0; \
+    E##gu0 =   Bgu0 ^((~Bga0)&  Bge0 ); \
+    Cu0 ^= E##gu0; \
+    /* E##ga1..E##gu1, folded into Ca1..Cu1 */ \
+    A##bo1 ^= Do1; \
+    Bga1 = ROL32(A##bo1, 14); \
+    A##gu1 ^= Du1; \
+    Bge1 = ROL32(A##gu1, 10); \
+    A##ka0 ^= Da0; \
+    Bgi1 = ROL32(A##ka0, 1); \
+    A##me0 ^= De0; \
+    Bgo1 = ROL32(A##me0, 22); \
+    A##si0 ^= Di0; \
+    Bgu1 = ROL32(A##si0, 30); \
+    E##ga1 =   Bga1 ^((~Bge1)&  Bgi1 ); \
+    Ca1 ^= E##ga1; \
+    E##ge1 =   Bge1 ^((~Bgi1)&  Bgo1 ); \
+    Ce1 ^= E##ge1; \
+    E##gi1 =   Bgi1 ^((~Bgo1)&  Bgu1 ); \
+    Ci1 ^= E##gi1; \
+    E##go1 =   Bgo1 ^((~Bgu1)&  Bga1 ); \
+    Co1 ^= E##go1; \
+    E##gu1 =   Bgu1 ^((~Bga1)&  Bge1 ); \
+    Cu1 ^= E##gu1; \
+    /* E##ka0..E##ku0, folded into Ca0..Cu0 */ \
+    A##be1 ^= De1; \
+    Bka0 = ROL32(A##be1, 1); \
+    A##gi0 ^= Di0; \
+    Bke0 = ROL32(A##gi0, 3); \
+    A##ko1 ^= Do1; \
+    Bki0 = ROL32(A##ko1, 13); \
+    A##mu0 ^= Du0; \
+    Bko0 = ROL32(A##mu0, 4); \
+    A##sa0 ^= Da0; \
+    Bku0 = ROL32(A##sa0, 9); \
+    E##ka0 =   Bka0 ^((~Bke0)&  Bki0 ); \
+    Ca0 ^= E##ka0; \
+    E##ke0 =   Bke0 ^((~Bki0)&  Bko0 ); \
+    Ce0 ^= E##ke0; \
+    E##ki0 =   Bki0 ^((~Bko0)&  Bku0 ); \
+    Ci0 ^= E##ki0; \
+    E##ko0 =   Bko0 ^((~Bku0)&  Bka0 ); \
+    Co0 ^= E##ko0; \
+    E##ku0 =   Bku0 ^((~Bka0)&  Bke0 ); \
+    Cu0 ^= E##ku0; \
+    /* E##ka1..E##ku1, folded into Ca1..Cu1 */ \
+    A##be0 ^= De0; \
+    Bka1 = A##be0; \
+    A##gi1 ^= Di1; \
+    Bke1 = ROL32(A##gi1, 3); \
+    A##ko0 ^= Do0; \
+    Bki1 = ROL32(A##ko0, 12); \
+    A##mu1 ^= Du1; \
+    Bko1 = ROL32(A##mu1, 4); \
+    A##sa1 ^= Da1; \
+    Bku1 = ROL32(A##sa1, 9); \
+    E##ka1 =   Bka1 ^((~Bke1)&  Bki1 ); \
+    Ca1 ^= E##ka1; \
+    E##ke1 =   Bke1 ^((~Bki1)&  Bko1 ); \
+    Ce1 ^= E##ke1; \
+    E##ki1 =   Bki1 ^((~Bko1)&  Bku1 ); \
+    Ci1 ^= E##ki1; \
+    E##ko1 =   Bko1 ^((~Bku1)&  Bka1 ); \
+    Co1 ^= E##ko1; \
+    E##ku1 =   Bku1 ^((~Bka1)&  Bke1 ); \
+    Cu1 ^= E##ku1; \
+    /* E##ma0..E##mu0, folded into Ca0..Cu0 */ \
+    A##bu1 ^= Du1; \
+    Bma0 = ROL32(A##bu1, 14); \
+    A##ga0 ^= Da0; \
+    Bme0 = ROL32(A##ga0, 18); \
+    A##ke0 ^= De0; \
+    Bmi0 = ROL32(A##ke0, 5); \
+    A##mi1 ^= Di1; \
+    Bmo0 = ROL32(A##mi1, 8); \
+    A##so0 ^= Do0; \
+    Bmu0 = ROL32(A##so0, 28); \
+    E##ma0 =   Bma0 ^((~Bme0)&  Bmi0 ); \
+    Ca0 ^= E##ma0; \
+    E##me0 =   Bme0 ^((~Bmi0)&  Bmo0 ); \
+    Ce0 ^= E##me0; \
+    E##mi0 =   Bmi0 ^((~Bmo0)&  Bmu0 ); \
+    Ci0 ^= E##mi0; \
+    E##mo0 =   Bmo0 ^((~Bmu0)&  Bma0 ); \
+    Co0 ^= E##mo0; \
+    E##mu0 =   Bmu0 ^((~Bma0)&  Bme0 ); \
+    Cu0 ^= E##mu0; \
+    /* E##ma1..E##mu1, folded into Ca1..Cu1 */ \
+    A##bu0 ^= Du0; \
+    Bma1 = ROL32(A##bu0, 13); \
+    A##ga1 ^= Da1; \
+    Bme1 = ROL32(A##ga1, 18); \
+    A##ke1 ^= De1; \
+    Bmi1 = ROL32(A##ke1, 5); \
+    A##mi0 ^= Di0; \
+    Bmo1 = ROL32(A##mi0, 7); \
+    A##so1 ^= Do1; \
+    Bmu1 = ROL32(A##so1, 28); \
+    E##ma1 =   Bma1 ^((~Bme1)&  Bmi1 ); \
+    Ca1 ^= E##ma1; \
+    E##me1 =   Bme1 ^((~Bmi1)&  Bmo1 ); \
+    Ce1 ^= E##me1; \
+    E##mi1 =   Bmi1 ^((~Bmo1)&  Bmu1 ); \
+    Ci1 ^= E##mi1; \
+    E##mo1 =   Bmo1 ^((~Bmu1)&  Bma1 ); \
+    Co1 ^= E##mo1; \
+    E##mu1 =   Bmu1 ^((~Bma1)&  Bme1 ); \
+    Cu1 ^= E##mu1; \
+    /* E##sa0..E##su0, folded into Ca0..Cu0 */ \
+    A##bi0 ^= Di0; \
+    Bsa0 = ROL32(A##bi0, 31); \
+    A##go1 ^= Do1; \
+    Bse0 = ROL32(A##go1, 28); \
+    A##ku1 ^= Du1; \
+    Bsi0 = ROL32(A##ku1, 20); \
+    A##ma1 ^= Da1; \
+    Bso0 = ROL32(A##ma1, 21); \
+    A##se0 ^= De0; \
+    Bsu0 = ROL32(A##se0, 1); \
+    E##sa0 =   Bsa0 ^((~Bse0)&  Bsi0 ); \
+    Ca0 ^= E##sa0; \
+    E##se0 =   Bse0 ^((~Bsi0)&  Bso0 ); \
+    Ce0 ^= E##se0; \
+    E##si0 =   Bsi0 ^((~Bso0)&  Bsu0 ); \
+    Ci0 ^= E##si0; \
+    E##so0 =   Bso0 ^((~Bsu0)&  Bsa0 ); \
+    Co0 ^= E##so0; \
+    E##su0 =   Bsu0 ^((~Bsa0)&  Bse0 ); \
+    Cu0 ^= E##su0; \
+    /* E##sa1..E##su1, folded into Ca1..Cu1 */ \
+    A##bi1 ^= Di1; \
+    Bsa1 = ROL32(A##bi1, 31); \
+    A##go0 ^= Do0; \
+    Bse1 = ROL32(A##go0, 27); \
+    A##ku0 ^= Du0; \
+    Bsi1 = ROL32(A##ku0, 19); \
+    A##ma0 ^= Da0; \
+    Bso1 = ROL32(A##ma0, 20); \
+    A##se1 ^= De1; \
+    Bsu1 = ROL32(A##se1, 1); \
+    E##sa1 =   Bsa1 ^((~Bse1)&  Bsi1 ); \
+    Ca1 ^= E##sa1; \
+    E##se1 =   Bse1 ^((~Bsi1)&  Bso1 ); \
+    Ce1 ^= E##se1; \
+    E##si1 =   Bsi1 ^((~Bso1)&  Bsu1 ); \
+    Ci1 ^= E##si1; \
+    E##so1 =   Bso1 ^((~Bsu1)&  Bsa1 ); \
+    Co1 ^= E##so1; \
+    E##su1 =   Bsu1 ^((~Bsa1)&  Bse1 ); \
+    Cu1 ^= E##su1; \
+\
+
+/*  --- Code for round (variant used when UseBebigokimisa is not defined; every E## word is computed as B ^ ((~B') & B'')): i indexes KeccakF1600RoundConstants_int2_0/_1; A and E are variable-name prefixes. Reads Ca0..Cu1 (must be set on entry) to build Da0..Du1, XORs the D values into the A## variables in place and writes the E## variables; unlike the prepare-theta variant it does not update Ca0..Cu1. */
+/*  --- using factor 2 interleaving, 64-bit lanes mapped to 32-bit words */
+#define thetaRhoPiChiIota(i, A, E) \
+    Da0 = Cu0^ROL32(Ce1, 1); \
+    Da1 = Cu1^Ce0; \
+    De0 = Ca0^ROL32(Ci1, 1); \
+    De1 = Ca1^Ci0; \
+    Di0 = Ce0^ROL32(Co1, 1); \
+    Di1 = Ce1^Co0; \
+    Do0 = Ci0^ROL32(Cu1, 1); \
+    Do1 = Ci1^Cu0; \
+    Du0 = Co0^ROL32(Ca1, 1); \
+    Du1 = Co1^Ca0; \
+    /* E##ba0..E##bu0 (round constant XORed into E##ba0) */ \
+    A##ba0 ^= Da0; \
+    Bba0 = A##ba0; \
+    A##ge0 ^= De0; \
+    Bbe0 = ROL32(A##ge0, 22); \
+    A##ki1 ^= Di1; \
+    Bbi0 = ROL32(A##ki1, 22); \
+    A##mo1 ^= Do1; \
+    Bbo0 = ROL32(A##mo1, 11); \
+    A##su0 ^= Du0; \
+    Bbu0 = ROL32(A##su0, 7); \
+    E##ba0 =   Bba0 ^((~Bbe0)&  Bbi0 ); \
+    E##ba0 ^= KeccakF1600RoundConstants_int2_0[i]; \
+    E##be0 =   Bbe0 ^((~Bbi0)&  Bbo0 ); \
+    E##bi0 =   Bbi0 ^((~Bbo0)&  Bbu0 ); \
+    E##bo0 =   Bbo0 ^((~Bbu0)&  Bba0 ); \
+    E##bu0 =   Bbu0 ^((~Bba0)&  Bbe0 ); \
+    /* E##ba1..E##bu1 (round constant XORed into E##ba1) */ \
+    A##ba1 ^= Da1; \
+    Bba1 = A##ba1; \
+    A##ge1 ^= De1; \
+    Bbe1 = ROL32(A##ge1, 22); \
+    A##ki0 ^= Di0; \
+    Bbi1 = ROL32(A##ki0, 21); \
+    A##mo0 ^= Do0; \
+    Bbo1 = ROL32(A##mo0, 10); \
+    A##su1 ^= Du1; \
+    Bbu1 = ROL32(A##su1, 7); \
+    E##ba1 =   Bba1 ^((~Bbe1)&  Bbi1 ); \
+    E##ba1 ^= KeccakF1600RoundConstants_int2_1[i]; \
+    E##be1 =   Bbe1 ^((~Bbi1)&  Bbo1 ); \
+    E##bi1 =   Bbi1 ^((~Bbo1)&  Bbu1 ); \
+    E##bo1 =   Bbo1 ^((~Bbu1)&  Bba1 ); \
+    E##bu1 =   Bbu1 ^((~Bba1)&  Bbe1 ); \
+    /* E##ga0..E##gu0 */ \
+    A##bo0 ^= Do0; \
+    Bga0 = ROL32(A##bo0, 14); \
+    A##gu0 ^= Du0; \
+    Bge0 = ROL32(A##gu0, 10); \
+    A##ka1 ^= Da1; \
+    Bgi0 = ROL32(A##ka1, 2); \
+    A##me1 ^= De1; \
+    Bgo0 = ROL32(A##me1, 23); \
+    A##si1 ^= Di1; \
+    Bgu0 = ROL32(A##si1, 31); \
+    E##ga0 =   Bga0 ^((~Bge0)&  Bgi0 ); \
+    E##ge0 =   Bge0 ^((~Bgi0)&  Bgo0 ); \
+    E##gi0 =   Bgi0 ^((~Bgo0)&  Bgu0 ); \
+    E##go0 =   Bgo0 ^((~Bgu0)&  Bga0 ); \
+    E##gu0 =   Bgu0 ^((~Bga0)&  Bge0 ); \
+    /* E##ga1..E##gu1 */ \
+    A##bo1 ^= Do1; \
+    Bga1 = ROL32(A##bo1, 14); \
+    A##gu1 ^= Du1; \
+    Bge1 = ROL32(A##gu1, 10); \
+    A##ka0 ^= Da0; \
+    Bgi1 = ROL32(A##ka0, 1); \
+    A##me0 ^= De0; \
+    Bgo1 = ROL32(A##me0, 22); \
+    A##si0 ^= Di0; \
+    Bgu1 = ROL32(A##si0, 30); \
+    E##ga1 =   Bga1 ^((~Bge1)&  Bgi1 ); \
+    E##ge1 =   Bge1 ^((~Bgi1)&  Bgo1 ); \
+    E##gi1 =   Bgi1 ^((~Bgo1)&  Bgu1 ); \
+    E##go1 =   Bgo1 ^((~Bgu1)&  Bga1 ); \
+    E##gu1 =   Bgu1 ^((~Bga1)&  Bge1 ); \
+    /* E##ka0..E##ku0 */ \
+    A##be1 ^= De1; \
+    Bka0 = ROL32(A##be1, 1); \
+    A##gi0 ^= Di0; \
+    Bke0 = ROL32(A##gi0, 3); \
+    A##ko1 ^= Do1; \
+    Bki0 = ROL32(A##ko1, 13); \
+    A##mu0 ^= Du0; \
+    Bko0 = ROL32(A##mu0, 4); \
+    A##sa0 ^= Da0; \
+    Bku0 = ROL32(A##sa0, 9); \
+    E##ka0 =   Bka0 ^((~Bke0)&  Bki0 ); \
+    E##ke0 =   Bke0 ^((~Bki0)&  Bko0 ); \
+    E##ki0 =   Bki0 ^((~Bko0)&  Bku0 ); \
+    E##ko0 =   Bko0 ^((~Bku0)&  Bka0 ); \
+    E##ku0 =   Bku0 ^((~Bka0)&  Bke0 ); \
+    /* E##ka1..E##ku1 */ \
+    A##be0 ^= De0; \
+    Bka1 = A##be0; \
+    A##gi1 ^= Di1; \
+    Bke1 = ROL32(A##gi1, 3); \
+    A##ko0 ^= Do0; \
+    Bki1 = ROL32(A##ko0, 12); \
+    A##mu1 ^= Du1; \
+    Bko1 = ROL32(A##mu1, 4); \
+    A##sa1 ^= Da1; \
+    Bku1 = ROL32(A##sa1, 9); \
+    E##ka1 =   Bka1 ^((~Bke1)&  Bki1 ); \
+    E##ke1 =   Bke1 ^((~Bki1)&  Bko1 ); \
+    E##ki1 =   Bki1 ^((~Bko1)&  Bku1 ); \
+    E##ko1 =   Bko1 ^((~Bku1)&  Bka1 ); \
+    E##ku1 =   Bku1 ^((~Bka1)&  Bke1 ); \
+    /* E##ma0..E##mu0 */ \
+    A##bu1 ^= Du1; \
+    Bma0 = ROL32(A##bu1, 14); \
+    A##ga0 ^= Da0; \
+    Bme0 = ROL32(A##ga0, 18); \
+    A##ke0 ^= De0; \
+    Bmi0 = ROL32(A##ke0, 5); \
+    A##mi1 ^= Di1; \
+    Bmo0 = ROL32(A##mi1, 8); \
+    A##so0 ^= Do0; \
+    Bmu0 = ROL32(A##so0, 28); \
+    E##ma0 =   Bma0 ^((~Bme0)&  Bmi0 ); \
+    E##me0 =   Bme0 ^((~Bmi0)&  Bmo0 ); \
+    E##mi0 =   Bmi0 ^((~Bmo0)&  Bmu0 ); \
+    E##mo0 =   Bmo0 ^((~Bmu0)&  Bma0 ); \
+    E##mu0 =   Bmu0 ^((~Bma0)&  Bme0 ); \
+    /* E##ma1..E##mu1 */ \
+    A##bu0 ^= Du0; \
+    Bma1 = ROL32(A##bu0, 13); \
+    A##ga1 ^= Da1; \
+    Bme1 = ROL32(A##ga1, 18); \
+    A##ke1 ^= De1; \
+    Bmi1 = ROL32(A##ke1, 5); \
+    A##mi0 ^= Di0; \
+    Bmo1 = ROL32(A##mi0, 7); \
+    A##so1 ^= Do1; \
+    Bmu1 = ROL32(A##so1, 28); \
+    E##ma1 =   Bma1 ^((~Bme1)&  Bmi1 ); \
+    E##me1 =   Bme1 ^((~Bmi1)&  Bmo1 ); \
+    E##mi1 =   Bmi1 ^((~Bmo1)&  Bmu1 ); \
+    E##mo1 =   Bmo1 ^((~Bmu1)&  Bma1 ); \
+    E##mu1 =   Bmu1 ^((~Bma1)&  Bme1 ); \
+    /* E##sa0..E##su0 */ \
+    A##bi0 ^= Di0; \
+    Bsa0 = ROL32(A##bi0, 31); \
+    A##go1 ^= Do1; \
+    Bse0 = ROL32(A##go1, 28); \
+    A##ku1 ^= Du1; \
+    Bsi0 = ROL32(A##ku1, 20); \
+    A##ma1 ^= Da1; \
+    Bso0 = ROL32(A##ma1, 21); \
+    A##se0 ^= De0; \
+    Bsu0 = ROL32(A##se0, 1); \
+    E##sa0 =   Bsa0 ^((~Bse0)&  Bsi0 ); \
+    E##se0 =   Bse0 ^((~Bsi0)&  Bso0 ); \
+    E##si0 =   Bsi0 ^((~Bso0)&  Bsu0 ); \
+    E##so0 =   Bso0 ^((~Bsu0)&  Bsa0 ); \
+    E##su0 =   Bsu0 ^((~Bsa0)&  Bse0 ); \
+    /* E##sa1..E##su1 */ \
+    A##bi1 ^= Di1; \
+    Bsa1 = ROL32(A##bi1, 31); \
+    A##go0 ^= Do0; \
+    Bse1 = ROL32(A##go0, 27); \
+    A##ku0 ^= Du0; \
+    Bsi1 = ROL32(A##ku0, 19); \
+    A##ma0 ^= Da0; \
+    Bso1 = ROL32(A##ma0, 20); \
+    A##se1 ^= De1; \
+    Bsu1 = ROL32(A##se1, 1); \
+    E##sa1 =   Bsa1 ^((~Bse1)&  Bsi1 ); \
+    E##se1 =   Bse1 ^((~Bsi1)&  Bso1 ); \
+    E##si1 =   Bsi1 ^((~Bso1)&  Bsu1 ); \
+    E##so1 =   Bso1 ^((~Bsu1)&  Bsa1 ); \
+    E##su1 =   Bsu1 ^((~Bsa1)&  Bse1 ); \
+\
+
+#endif /*  UseBebigokimisa */
+
+const UINT32 KeccakF1600RoundConstants_int2_0[24] = { /* 24 per-round values, each 0 or 1; the round macros XOR entry [i] into E##ba0. NOTE(review): presumably the ...0 halves of the bit-interleaved 64-bit round constants -- confirm against the generator. */
+    0x00000001UL,
+    0x00000000UL,
+    0x00000000UL,
+    0x00000000UL,
+    0x00000001UL,
+    0x00000001UL,
+    0x00000001UL,
+    0x00000001UL,
+    0x00000000UL,
+    0x00000000UL,
+    0x00000001UL,
+    0x00000000UL,
+    0x00000001UL,
+    0x00000001UL,
+    0x00000001UL,
+    0x00000001UL,
+    0x00000000UL,
+    0x00000000UL,
+    0x00000000UL,
+    0x00000000UL,
+    0x00000001UL,
+    0x00000000UL,
+    0x00000001UL,
+    0x00000000UL };
+
+const UINT32 KeccakF1600RoundConstants_int2_1[24] = { /* 24 per-round values; the round macros XOR entry [i] into E##ba1. NOTE(review): presumably the ...1 halves of the bit-interleaved 64-bit round constants -- confirm against the generator. */
+    0x00000000UL,
+    0x00000089UL,
+    0x8000008bUL,
+    0x80008080UL,
+    0x0000008bUL,
+    0x00008000UL,
+    0x80008088UL,
+    0x80000082UL,
+    0x0000000bUL,
+    0x0000000aUL,
+    0x00008082UL,
+    0x00008003UL,
+    0x0000808bUL,
+    0x8000000bUL,
+    0x8000008aUL,
+    0x80000081UL,
+    0x80000081UL,
+    0x80000008UL,
+    0x00000083UL,
+    0x80008003UL,
+    0x80008088UL,
+    0x80000088UL,
+    0x00008000UL,
+    0x80008082UL };
+
+#define copyFromStateAndXor1024bits(X, state, input) /* Load X##ba0..X##su1 from state[0..49]; the first 32 words (X##ba0..X##ma1) are XORed with input[0..31], the remaining 18 are copied unchanged. 32 words is 1024 bits assuming 32-bit elements. */ \
+    X##ba0 = state[ 0]^input[ 0]; \
+    X##ba1 = state[ 1]^input[ 1]; \
+    X##be0 = state[ 2]^input[ 2]; \
+    X##be1 = state[ 3]^input[ 3]; \
+    X##bi0 = state[ 4]^input[ 4]; \
+    X##bi1 = state[ 5]^input[ 5]; \
+    X##bo0 = state[ 6]^input[ 6]; \
+    X##bo1 = state[ 7]^input[ 7]; \
+    X##bu0 = state[ 8]^input[ 8]; \
+    X##bu1 = state[ 9]^input[ 9]; \
+    X##ga0 = state[10]^input[10]; \
+    X##ga1 = state[11]^input[11]; \
+    X##ge0 = state[12]^input[12]; \
+    X##ge1 = state[13]^input[13]; \
+    X##gi0 = state[14]^input[14]; \
+    X##gi1 = state[15]^input[15]; \
+    X##go0 = state[16]^input[16]; \
+    X##go1 = state[17]^input[17]; \
+    X##gu0 = state[18]^input[18]; \
+    X##gu1 = state[19]^input[19]; \
+    X##ka0 = state[20]^input[20]; \
+    X##ka1 = state[21]^input[21]; \
+    X##ke0 = state[22]^input[22]; \
+    X##ke1 = state[23]^input[23]; \
+    X##ki0 = state[24]^input[24]; \
+    X##ki1 = state[25]^input[25]; \
+    X##ko0 = state[26]^input[26]; \
+    X##ko1 = state[27]^input[27]; \
+    X##ku0 = state[28]^input[28]; \
+    X##ku1 = state[29]^input[29]; \
+    X##ma0 = state[30]^input[30]; \
+    X##ma1 = state[31]^input[31]; \
+    X##me0 = state[32]; \
+    X##me1 = state[33]; \
+    X##mi0 = state[34]; \
+    X##mi1 = state[35]; \
+    X##mo0 = state[36]; \
+    X##mo1 = state[37]; \
+    X##mu0 = state[38]; \
+    X##mu1 = state[39]; \
+    X##sa0 = state[40]; \
+    X##sa1 = state[41]; \
+    X##se0 = state[42]; \
+    X##se1 = state[43]; \
+    X##si0 = state[44]; \
+    X##si1 = state[45]; \
+    X##so0 = state[46]; \
+    X##so1 = state[47]; \
+    X##su0 = state[48]; \
+    X##su1 = state[49]; \
+
+#define copyFromStateAndXor1088bits(X, state, input) /* Load X##ba0..X##su1 from state[0..49]; the first 34 words (X##ba0..X##me1) are XORed with input[0..33], the remaining 16 are copied unchanged. 34 words is 1088 bits assuming 32-bit elements. */ \
+    X##ba0 = state[ 0]^input[ 0]; \
+    X##ba1 = state[ 1]^input[ 1]; \
+    X##be0 = state[ 2]^input[ 2]; \
+    X##be1 = state[ 3]^input[ 3]; \
+    X##bi0 = state[ 4]^input[ 4]; \
+    X##bi1 = state[ 5]^input[ 5]; \
+    X##bo0 = state[ 6]^input[ 6]; \
+    X##bo1 = state[ 7]^input[ 7]; \
+    X##bu0 = state[ 8]^input[ 8]; \
+    X##bu1 = state[ 9]^input[ 9]; \
+    X##ga0 = state[10]^input[10]; \
+    X##ga1 = state[11]^input[11]; \
+    X##ge0 = state[12]^input[12]; \
+    X##ge1 = state[13]^input[13]; \
+    X##gi0 = state[14]^input[14]; \
+    X##gi1 = state[15]^input[15]; \
+    X##go0 = state[16]^input[16]; \
+    X##go1 = state[17]^input[17]; \
+    X##gu0 = state[18]^input[18]; \
+    X##gu1 = state[19]^input[19]; \
+    X##ka0 = state[20]^input[20]; \
+    X##ka1 = state[21]^input[21]; \
+    X##ke0 = state[22]^input[22]; \
+    X##ke1 = state[23]^input[23]; \
+    X##ki0 = state[24]^input[24]; \
+    X##ki1 = state[25]^input[25]; \
+    X##ko0 = state[26]^input[26]; \
+    X##ko1 = state[27]^input[27]; \
+    X##ku0 = state[28]^input[28]; \
+    X##ku1 = state[29]^input[29]; \
+    X##ma0 = state[30]^input[30]; \
+    X##ma1 = state[31]^input[31]; \
+    X##me0 = state[32]^input[32]; \
+    X##me1 = state[33]^input[33]; \
+    X##mi0 = state[34]; \
+    X##mi1 = state[35]; \
+    X##mo0 = state[36]; \
+    X##mo1 = state[37]; \
+    X##mu0 = state[38]; \
+    X##mu1 = state[39]; \
+    X##sa0 = state[40]; \
+    X##sa1 = state[41]; \
+    X##se0 = state[42]; \
+    X##se1 = state[43]; \
+    X##si0 = state[44]; \
+    X##si1 = state[45]; \
+    X##so0 = state[46]; \
+    X##so1 = state[47]; \
+    X##su0 = state[48]; \
+    X##su1 = state[49]; \
+
+#define copyFromState(X, state) /* Load the 50 variables X##ba0..X##su1 from state[0..49] unchanged; the ...0 variable of each pair comes from the even index, the ...1 variable from the following odd index. */ \
+    X##ba0 = state[ 0]; \
+    X##ba1 = state[ 1]; \
+    X##be0 = state[ 2]; \
+    X##be1 = state[ 3]; \
+    X##bi0 = state[ 4]; \
+    X##bi1 = state[ 5]; \
+    X##bo0 = state[ 6]; \
+    X##bo1 = state[ 7]; \
+    X##bu0 = state[ 8]; \
+    X##bu1 = state[ 9]; \
+    X##ga0 = state[10]; \
+    X##ga1 = state[11]; \
+    X##ge0 = state[12]; \
+    X##ge1 = state[13]; \
+    X##gi0 = state[14]; \
+    X##gi1 = state[15]; \
+    X##go0 = state[16]; \
+    X##go1 = state[17]; \
+    X##gu0 = state[18]; \
+    X##gu1 = state[19]; \
+    X##ka0 = state[20]; \
+    X##ka1 = state[21]; \
+    X##ke0 = state[22]; \
+    X##ke1 = state[23]; \
+    X##ki0 = state[24]; \
+    X##ki1 = state[25]; \
+    X##ko0 = state[26]; \
+    X##ko1 = state[27]; \
+    X##ku0 = state[28]; \
+    X##ku1 = state[29]; \
+    X##ma0 = state[30]; \
+    X##ma1 = state[31]; \
+    X##me0 = state[32]; \
+    X##me1 = state[33]; \
+    X##mi0 = state[34]; \
+    X##mi1 = state[35]; \
+    X##mo0 = state[36]; \
+    X##mo1 = state[37]; \
+    X##mu0 = state[38]; \
+    X##mu1 = state[39]; \
+    X##sa0 = state[40]; \
+    X##sa1 = state[41]; \
+    X##se0 = state[42]; \
+    X##se1 = state[43]; \
+    X##si0 = state[44]; \
+    X##si1 = state[45]; \
+    X##so0 = state[46]; \
+    X##so1 = state[47]; \
+    X##su0 = state[48]; \
+    X##su1 = state[49]; \
+
+#define copyToState(state, X) /* Store the 50 variables X##ba0..X##su1 into state[0..49], using the same index layout that copyFromState reads. */ \
+    state[ 0] = X##ba0; \
+    state[ 1] = X##ba1; \
+    state[ 2] = X##be0; \
+    state[ 3] = X##be1; \
+    state[ 4] = X##bi0; \
+    state[ 5] = X##bi1; \
+    state[ 6] = X##bo0; \
+    state[ 7] = X##bo1; \
+    state[ 8] = X##bu0; \
+    state[ 9] = X##bu1; \
+    state[10] = X##ga0; \
+    state[11] = X##ga1; \
+    state[12] = X##ge0; \
+    state[13] = X##ge1; \
+    state[14] = X##gi0; \
+    state[15] = X##gi1; \
+    state[16] = X##go0; \
+    state[17] = X##go1; \
+    state[18] = X##gu0; \
+    state[19] = X##gu1; \
+    state[20] = X##ka0; \
+    state[21] = X##ka1; \
+    state[22] = X##ke0; \
+    state[23] = X##ke1; \
+    state[24] = X##ki0; \
+    state[25] = X##ki1; \
+    state[26] = X##ko0; \
+    state[27] = X##ko1; \
+    state[28] = X##ku0; \
+    state[29] = X##ku1; \
+    state[30] = X##ma0; \
+    state[31] = X##ma1; \
+    state[32] = X##me0; \
+    state[33] = X##me1; \
+    state[34] = X##mi0; \
+    state[35] = X##mi1; \
+    state[36] = X##mo0; \
+    state[37] = X##mo1; \
+    state[38] = X##mu0; \
+    state[39] = X##mu1; \
+    state[40] = X##sa0; \
+    state[41] = X##sa1; \
+    state[42] = X##se0; \
+    state[43] = X##se1; \
+    state[44] = X##si0; \
+    state[45] = X##si1; \
+    state[46] = X##so0; \
+    state[47] = X##so1; \
+    state[48] = X##su0; \
+    state[49] = X##su1; \
+
+#define copyStateVariables(X, Y) /* Assign each of the 50 variables Y##ba0..Y##su1 to the X## variable with the same suffix (X is the destination prefix, Y the source prefix). */ \
+    X##ba0 = Y##ba0; \
+    X##ba1 = Y##ba1; \
+    X##be0 = Y##be0; \
+    X##be1 = Y##be1; \
+    X##bi0 = Y##bi0; \
+    X##bi1 = Y##bi1; \
+    X##bo0 = Y##bo0; \
+    X##bo1 = Y##bo1; \
+    X##bu0 = Y##bu0; \
+    X##bu1 = Y##bu1; \
+    X##ga0 = Y##ga0; \
+    X##ga1 = Y##ga1; \
+    X##ge0 = Y##ge0; \
+    X##ge1 = Y##ge1; \
+    X##gi0 = Y##gi0; \
+    X##gi1 = Y##gi1; \
+    X##go0 = Y##go0; \
+    X##go1 = Y##go1; \
+    X##gu0 = Y##gu0; \
+    X##gu1 = Y##gu1; \
+    X##ka0 = Y##ka0; \
+    X##ka1 = Y##ka1; \
+    X##ke0 = Y##ke0; \
+    X##ke1 = Y##ke1; \
+    X##ki0 = Y##ki0; \
+    X##ki1 = Y##ki1; \
+    X##ko0 = Y##ko0; \
+    X##ko1 = Y##ko1; \
+    X##ku0 = Y##ku0; \
+    X##ku1 = Y##ku1; \
+    X##ma0 = Y##ma0; \
+    X##ma1 = Y##ma1; \
+    X##me0 = Y##me0; \
+    X##me1 = Y##me1; \
+    X##mi0 = Y##mi0; \
+    X##mi1 = Y##mi1; \
+    X##mo0 = Y##mo0; \
+    X##mo1 = Y##mo1; \
+    X##mu0 = Y##mu0; \
+    X##mu1 = Y##mu1; \
+    X##sa0 = Y##sa0; \
+    X##sa1 = Y##sa1; \
+    X##se0 = Y##se0; \
+    X##se1 = Y##se1; \
+    X##si0 = Y##si0; \
+    X##si1 = Y##si1; \
+    X##so0 = Y##so0; \
+    X##so1 = Y##so1; \
+    X##su0 = Y##su0; \
+    X##su1 = Y##su1; \
+
diff --git a/Modules/_sha3/keccak/KeccakF-1600-32-s2.macros b/Modules/_sha3/keccak/KeccakF-1600-32-s2.macros
new file mode 100644
--- /dev/null
+++ b/Modules/_sha3/keccak/KeccakF-1600-32-s2.macros
@@ -0,0 +1,1187 @@
+/*
+Code automatically generated by KeccakTools!
+
+The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+Michaël Peeters and Gilles Van Assche. For more information, feedback or
+questions, please refer to our website: http://keccak.noekeon.org/
+
+Implementation by the designers,
+hereby denoted as "the implementer".
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#define declareABCDE /* declares the UINT32 locals used by the round macros: word sets A, B and E (50 words each, suffix 0/1 per lane name) plus C and D (10 words each) */ \
+    UINT32 Aba0, Abe0, Abi0, Abo0, Abu0; \
+    UINT32 Aba1, Abe1, Abi1, Abo1, Abu1; \
+    UINT32 Aga0, Age0, Agi0, Ago0, Agu0; \
+    UINT32 Aga1, Age1, Agi1, Ago1, Agu1; \
+    UINT32 Aka0, Ake0, Aki0, Ako0, Aku0; \
+    UINT32 Aka1, Ake1, Aki1, Ako1, Aku1; \
+    UINT32 Ama0, Ame0, Ami0, Amo0, Amu0; \
+    UINT32 Ama1, Ame1, Ami1, Amo1, Amu1; \
+    UINT32 Asa0, Ase0, Asi0, Aso0, Asu0; \
+    UINT32 Asa1, Ase1, Asi1, Aso1, Asu1; \
+    UINT32 Bba0, Bbe0, Bbi0, Bbo0, Bbu0; \
+    UINT32 Bba1, Bbe1, Bbi1, Bbo1, Bbu1; \
+    UINT32 Bga0, Bge0, Bgi0, Bgo0, Bgu0; \
+    UINT32 Bga1, Bge1, Bgi1, Bgo1, Bgu1; \
+    UINT32 Bka0, Bke0, Bki0, Bko0, Bku0; \
+    UINT32 Bka1, Bke1, Bki1, Bko1, Bku1; \
+    UINT32 Bma0, Bme0, Bmi0, Bmo0, Bmu0; \
+    UINT32 Bma1, Bme1, Bmi1, Bmo1, Bmu1; \
+    UINT32 Bsa0, Bse0, Bsi0, Bso0, Bsu0; \
+    UINT32 Bsa1, Bse1, Bsi1, Bso1, Bsu1; \
+    UINT32 Ca0, Ce0, Ci0, Co0, Cu0; \
+    UINT32 Ca1, Ce1, Ci1, Co1, Cu1; \
+    UINT32 Da0, De0, Di0, Do0, Du0; \
+    UINT32 Da1, De1, Di1, Do1, Du1; \
+    UINT32 Eba0, Ebe0, Ebi0, Ebo0, Ebu0; \
+    UINT32 Eba1, Ebe1, Ebi1, Ebo1, Ebu1; \
+    UINT32 Ega0, Ege0, Egi0, Ego0, Egu0; \
+    UINT32 Ega1, Ege1, Egi1, Ego1, Egu1; \
+    UINT32 Eka0, Eke0, Eki0, Eko0, Eku0; \
+    UINT32 Eka1, Eke1, Eki1, Eko1, Eku1; \
+    UINT32 Ema0, Eme0, Emi0, Emo0, Emu0; \
+    UINT32 Ema1, Eme1, Emi1, Emo1, Emu1; \
+    UINT32 Esa0, Ese0, Esi0, Eso0, Esu0; \
+    UINT32 Esa1, Ese1, Esi1, Eso1, Esu1; \
+
+#define prepareTheta /* sets C?w to the XOR of Ab?w, Ag?w, Ak?w, Am?w, As?w for ? in a,e,i,o,u and w in 0,1; the A prefix is hard-coded here, not a macro parameter */ \
+    Ca0 = Aba0^Aga0^Aka0^Ama0^Asa0; \
+    Ca1 = Aba1^Aga1^Aka1^Ama1^Asa1; \
+    Ce0 = Abe0^Age0^Ake0^Ame0^Ase0; \
+    Ce1 = Abe1^Age1^Ake1^Ame1^Ase1; \
+    Ci0 = Abi0^Agi0^Aki0^Ami0^Asi0; \
+    Ci1 = Abi1^Agi1^Aki1^Ami1^Asi1; \
+    Co0 = Abo0^Ago0^Ako0^Amo0^Aso0; \
+    Co1 = Abo1^Ago1^Ako1^Amo1^Aso1; \
+    Cu0 = Abu0^Agu0^Aku0^Amu0^Asu0; \
+    Cu1 = Abu1^Agu1^Aku1^Amu1^Asu1; \
+
+#ifdef UseBebigokimisa
+/*  --- Code for round, with prepare-theta (lane complementing pattern 'bebigokimisa'): D* from C*, A ^= D* in place, B* = rotated A words, E from B*, round constant i XORed into E##ba0/E##ba1, and C?w rebuilt as the XOR of E##b?w, E##g?w, E##k?w, E##m?w, E##s?w */
+/*  --- using factor 2 interleaving, 64-bit lanes mapped to 32-bit words; i indexes KeccakF1600RoundConstants_int2_0/_1, A and E are variable-name prefixes (A is modified), B*, C*, D* are fixed local names */
+#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \
+    Da0 = Cu0^ROL32(Ce1, 1); \
+    Da1 = Cu1^Ce0; \
+    De0 = Ca0^ROL32(Ci1, 1); \
+    De1 = Ca1^Ci0; \
+    Di0 = Ce0^ROL32(Co1, 1); \
+    Di1 = Ce1^Co0; \
+    Do0 = Ci0^ROL32(Cu1, 1); \
+    Do1 = Ci1^Cu0; \
+    Du0 = Co0^ROL32(Ca1, 1); \
+    Du1 = Co1^Ca0; \
+/* E##b?0 from A##ba0, A##ge0, A##ki1, A##mo1, A##su0; C?0 is (re)started here with plain assignment */ \
+    A##ba0 ^= Da0; \
+    Bba0 = A##ba0; \
+    A##ge0 ^= De0; \
+    Bbe0 = ROL32(A##ge0, 22); \
+    A##ki1 ^= Di1; \
+    Bbi0 = ROL32(A##ki1, 22); \
+    E##ba0 =   Bba0 ^(  Bbe0 |  Bbi0 ); \
+    E##ba0 ^= KeccakF1600RoundConstants_int2_0[i]; \
+    Ca0 = E##ba0; \
+    A##mo1 ^= Do1; \
+    Bbo0 = ROL32(A##mo1, 11); \
+    E##be0 =   Bbe0 ^((~Bbi0)|  Bbo0 ); \
+    Ce0 = E##be0; \
+    A##su0 ^= Du0; \
+    Bbu0 = ROL32(A##su0, 7); \
+    E##bi0 =   Bbi0 ^(  Bbo0 &  Bbu0 ); \
+    Ci0 = E##bi0; \
+    E##bo0 =   Bbo0 ^(  Bbu0 |  Bba0 ); \
+    Co0 = E##bo0; \
+    E##bu0 =   Bbu0 ^(  Bba0 &  Bbe0 ); \
+    Cu0 = E##bu0; \
+/* E##b?1 from A##ba1, A##ge1, A##ki0, A##mo0, A##su1; C?1 is (re)started here with plain assignment */ \
+    A##ba1 ^= Da1; \
+    Bba1 = A##ba1; \
+    A##ge1 ^= De1; \
+    Bbe1 = ROL32(A##ge1, 22); \
+    A##ki0 ^= Di0; \
+    Bbi1 = ROL32(A##ki0, 21); \
+    E##ba1 =   Bba1 ^(  Bbe1 |  Bbi1 ); \
+    E##ba1 ^= KeccakF1600RoundConstants_int2_1[i]; \
+    Ca1 = E##ba1; \
+    A##mo0 ^= Do0; \
+    Bbo1 = ROL32(A##mo0, 10); \
+    E##be1 =   Bbe1 ^((~Bbi1)|  Bbo1 ); \
+    Ce1 = E##be1; \
+    A##su1 ^= Du1; \
+    Bbu1 = ROL32(A##su1, 7); \
+    E##bi1 =   Bbi1 ^(  Bbo1 &  Bbu1 ); \
+    Ci1 = E##bi1; \
+    E##bo1 =   Bbo1 ^(  Bbu1 |  Bba1 ); \
+    Co1 = E##bo1; \
+    E##bu1 =   Bbu1 ^(  Bba1 &  Bbe1 ); \
+    Cu1 = E##bu1; \
+/* E##g?0 from A##bo0, A##gu0, A##ka1, A##me1, A##si1; XORed into C?0 */ \
+    A##bo0 ^= Do0; \
+    Bga0 = ROL32(A##bo0, 14); \
+    A##gu0 ^= Du0; \
+    Bge0 = ROL32(A##gu0, 10); \
+    A##ka1 ^= Da1; \
+    Bgi0 = ROL32(A##ka1, 2); \
+    E##ga0 =   Bga0 ^(  Bge0 |  Bgi0 ); \
+    Ca0 ^= E##ga0; \
+    A##me1 ^= De1; \
+    Bgo0 = ROL32(A##me1, 23); \
+    E##ge0 =   Bge0 ^(  Bgi0 &  Bgo0 ); \
+    Ce0 ^= E##ge0; \
+    A##si1 ^= Di1; \
+    Bgu0 = ROL32(A##si1, 31); \
+    E##gi0 =   Bgi0 ^(  Bgo0 |(~Bgu0)); \
+    Ci0 ^= E##gi0; \
+    E##go0 =   Bgo0 ^(  Bgu0 |  Bga0 ); \
+    Co0 ^= E##go0; \
+    E##gu0 =   Bgu0 ^(  Bga0 &  Bge0 ); \
+    Cu0 ^= E##gu0; \
+/* E##g?1 from A##bo1, A##gu1, A##ka0, A##me0, A##si0; XORed into C?1 */ \
+    A##bo1 ^= Do1; \
+    Bga1 = ROL32(A##bo1, 14); \
+    A##gu1 ^= Du1; \
+    Bge1 = ROL32(A##gu1, 10); \
+    A##ka0 ^= Da0; \
+    Bgi1 = ROL32(A##ka0, 1); \
+    E##ga1 =   Bga1 ^(  Bge1 |  Bgi1 ); \
+    Ca1 ^= E##ga1; \
+    A##me0 ^= De0; \
+    Bgo1 = ROL32(A##me0, 22); \
+    E##ge1 =   Bge1 ^(  Bgi1 &  Bgo1 ); \
+    Ce1 ^= E##ge1; \
+    A##si0 ^= Di0; \
+    Bgu1 = ROL32(A##si0, 30); \
+    E##gi1 =   Bgi1 ^(  Bgo1 |(~Bgu1)); \
+    Ci1 ^= E##gi1; \
+    E##go1 =   Bgo1 ^(  Bgu1 |  Bga1 ); \
+    Co1 ^= E##go1; \
+    E##gu1 =   Bgu1 ^(  Bga1 &  Bge1 ); \
+    Cu1 ^= E##gu1; \
+/* E##k?0 from A##be1, A##gi0, A##ko1, A##mu0, A##sa0; XORed into C?0 */ \
+    A##be1 ^= De1; \
+    Bka0 = ROL32(A##be1, 1); \
+    A##gi0 ^= Di0; \
+    Bke0 = ROL32(A##gi0, 3); \
+    A##ko1 ^= Do1; \
+    Bki0 = ROL32(A##ko1, 13); \
+    E##ka0 =   Bka0 ^(  Bke0 |  Bki0 ); \
+    Ca0 ^= E##ka0; \
+    A##mu0 ^= Du0; \
+    Bko0 = ROL32(A##mu0, 4); \
+    E##ke0 =   Bke0 ^(  Bki0 &  Bko0 ); \
+    Ce0 ^= E##ke0; \
+    A##sa0 ^= Da0; \
+    Bku0 = ROL32(A##sa0, 9); \
+    E##ki0 =   Bki0 ^((~Bko0)&  Bku0 ); \
+    Ci0 ^= E##ki0; \
+    E##ko0 = (~Bko0)^(  Bku0 |  Bka0 ); \
+    Co0 ^= E##ko0; \
+    E##ku0 =   Bku0 ^(  Bka0 &  Bke0 ); \
+    Cu0 ^= E##ku0; \
+/* E##k?1 from A##be0 (no rotation), A##gi1, A##ko0, A##mu1, A##sa1; XORed into C?1 */ \
+    A##be0 ^= De0; \
+    Bka1 = A##be0; \
+    A##gi1 ^= Di1; \
+    Bke1 = ROL32(A##gi1, 3); \
+    A##ko0 ^= Do0; \
+    Bki1 = ROL32(A##ko0, 12); \
+    E##ka1 =   Bka1 ^(  Bke1 |  Bki1 ); \
+    Ca1 ^= E##ka1; \
+    A##mu1 ^= Du1; \
+    Bko1 = ROL32(A##mu1, 4); \
+    E##ke1 =   Bke1 ^(  Bki1 &  Bko1 ); \
+    Ce1 ^= E##ke1; \
+    A##sa1 ^= Da1; \
+    Bku1 = ROL32(A##sa1, 9); \
+    E##ki1 =   Bki1 ^((~Bko1)&  Bku1 ); \
+    Ci1 ^= E##ki1; \
+    E##ko1 = (~Bko1)^(  Bku1 |  Bka1 ); \
+    Co1 ^= E##ko1; \
+    E##ku1 =   Bku1 ^(  Bka1 &  Bke1 ); \
+    Cu1 ^= E##ku1; \
+/* E##m?0 from A##bu1, A##ga0, A##ke0, A##mi1, A##so0; XORed into C?0 */ \
+    A##bu1 ^= Du1; \
+    Bma0 = ROL32(A##bu1, 14); \
+    A##ga0 ^= Da0; \
+    Bme0 = ROL32(A##ga0, 18); \
+    A##ke0 ^= De0; \
+    Bmi0 = ROL32(A##ke0, 5); \
+    E##ma0 =   Bma0 ^(  Bme0 &  Bmi0 ); \
+    Ca0 ^= E##ma0; \
+    A##mi1 ^= Di1; \
+    Bmo0 = ROL32(A##mi1, 8); \
+    E##me0 =   Bme0 ^(  Bmi0 |  Bmo0 ); \
+    Ce0 ^= E##me0; \
+    A##so0 ^= Do0; \
+    Bmu0 = ROL32(A##so0, 28); \
+    E##mi0 =   Bmi0 ^((~Bmo0)|  Bmu0 ); \
+    Ci0 ^= E##mi0; \
+    E##mo0 = (~Bmo0)^(  Bmu0 &  Bma0 ); \
+    Co0 ^= E##mo0; \
+    E##mu0 =   Bmu0 ^(  Bma0 |  Bme0 ); \
+    Cu0 ^= E##mu0; \
+/* E##m?1 from A##bu0, A##ga1, A##ke1, A##mi0, A##so1; XORed into C?1 */ \
+    A##bu0 ^= Du0; \
+    Bma1 = ROL32(A##bu0, 13); \
+    A##ga1 ^= Da1; \
+    Bme1 = ROL32(A##ga1, 18); \
+    A##ke1 ^= De1; \
+    Bmi1 = ROL32(A##ke1, 5); \
+    E##ma1 =   Bma1 ^(  Bme1 &  Bmi1 ); \
+    Ca1 ^= E##ma1; \
+    A##mi0 ^= Di0; \
+    Bmo1 = ROL32(A##mi0, 7); \
+    E##me1 =   Bme1 ^(  Bmi1 |  Bmo1 ); \
+    Ce1 ^= E##me1; \
+    A##so1 ^= Do1; \
+    Bmu1 = ROL32(A##so1, 28); \
+    E##mi1 =   Bmi1 ^((~Bmo1)|  Bmu1 ); \
+    Ci1 ^= E##mi1; \
+    E##mo1 = (~Bmo1)^(  Bmu1 &  Bma1 ); \
+    Co1 ^= E##mo1; \
+    E##mu1 =   Bmu1 ^(  Bma1 |  Bme1 ); \
+    Cu1 ^= E##mu1; \
+/* E##s?0 from A##bi0, A##go1, A##ku1, A##ma1, A##se0; XORed into C?0 */ \
+    A##bi0 ^= Di0; \
+    Bsa0 = ROL32(A##bi0, 31); \
+    A##go1 ^= Do1; \
+    Bse0 = ROL32(A##go1, 28); \
+    A##ku1 ^= Du1; \
+    Bsi0 = ROL32(A##ku1, 20); \
+    E##sa0 =   Bsa0 ^((~Bse0)&  Bsi0 ); \
+    Ca0 ^= E##sa0; \
+    A##ma1 ^= Da1; \
+    Bso0 = ROL32(A##ma1, 21); \
+    E##se0 = (~Bse0)^(  Bsi0 |  Bso0 ); \
+    Ce0 ^= E##se0; \
+    A##se0 ^= De0; \
+    Bsu0 = ROL32(A##se0, 1); \
+    E##si0 =   Bsi0 ^(  Bso0 &  Bsu0 ); \
+    Ci0 ^= E##si0; \
+    E##so0 =   Bso0 ^(  Bsu0 |  Bsa0 ); \
+    Co0 ^= E##so0; \
+    E##su0 =   Bsu0 ^(  Bsa0 &  Bse0 ); \
+    Cu0 ^= E##su0; \
+/* E##s?1 from A##bi1, A##go0, A##ku0, A##ma0, A##se1; XORed into C?1 */ \
+    A##bi1 ^= Di1; \
+    Bsa1 = ROL32(A##bi1, 31); \
+    A##go0 ^= Do0; \
+    Bse1 = ROL32(A##go0, 27); \
+    A##ku0 ^= Du0; \
+    Bsi1 = ROL32(A##ku0, 19); \
+    E##sa1 =   Bsa1 ^((~Bse1)&  Bsi1 ); \
+    Ca1 ^= E##sa1; \
+    A##ma0 ^= Da0; \
+    Bso1 = ROL32(A##ma0, 20); \
+    E##se1 = (~Bse1)^(  Bsi1 |  Bso1 ); \
+    Ce1 ^= E##se1; \
+    A##se1 ^= De1; \
+    Bsu1 = ROL32(A##se1, 1); \
+    E##si1 =   Bsi1 ^(  Bso1 &  Bsu1 ); \
+    Ci1 ^= E##si1; \
+    E##so1 =   Bso1 ^(  Bsu1 |  Bsa1 ); \
+    Co1 ^= E##so1; \
+    E##su1 =   Bsu1 ^(  Bsa1 &  Bse1 ); \
+    Cu1 ^= E##su1; \
+\
+
+/*  --- Code for round (lane complementing pattern 'bebigokimisa'): D* from C*, A ^= D* in place, B* = rotated A words, E from B*, round constant i XORed into E##ba0/E##ba1; C* is only read, never updated, by this variant */
+/*  --- using factor 2 interleaving, 64-bit lanes mapped to 32-bit words; i indexes KeccakF1600RoundConstants_int2_0/_1, A and E are variable-name prefixes (A is modified), B*, C*, D* are fixed local names */
+#define thetaRhoPiChiIota(i, A, E) \
+    Da0 = Cu0^ROL32(Ce1, 1); \
+    Da1 = Cu1^Ce0; \
+    De0 = Ca0^ROL32(Ci1, 1); \
+    De1 = Ca1^Ci0; \
+    Di0 = Ce0^ROL32(Co1, 1); \
+    Di1 = Ce1^Co0; \
+    Do0 = Ci0^ROL32(Cu1, 1); \
+    Do1 = Ci1^Cu0; \
+    Du0 = Co0^ROL32(Ca1, 1); \
+    Du1 = Co1^Ca0; \
+/* E##b?0 from A##ba0, A##ge0, A##ki1, A##mo1, A##su0 */ \
+    A##ba0 ^= Da0; \
+    Bba0 = A##ba0; \
+    A##ge0 ^= De0; \
+    Bbe0 = ROL32(A##ge0, 22); \
+    A##ki1 ^= Di1; \
+    Bbi0 = ROL32(A##ki1, 22); \
+    E##ba0 =   Bba0 ^(  Bbe0 |  Bbi0 ); \
+    E##ba0 ^= KeccakF1600RoundConstants_int2_0[i]; \
+    A##mo1 ^= Do1; \
+    Bbo0 = ROL32(A##mo1, 11); \
+    E##be0 =   Bbe0 ^((~Bbi0)|  Bbo0 ); \
+    A##su0 ^= Du0; \
+    Bbu0 = ROL32(A##su0, 7); \
+    E##bi0 =   Bbi0 ^(  Bbo0 &  Bbu0 ); \
+    E##bo0 =   Bbo0 ^(  Bbu0 |  Bba0 ); \
+    E##bu0 =   Bbu0 ^(  Bba0 &  Bbe0 ); \
+/* E##b?1 from A##ba1, A##ge1, A##ki0, A##mo0, A##su1 */ \
+    A##ba1 ^= Da1; \
+    Bba1 = A##ba1; \
+    A##ge1 ^= De1; \
+    Bbe1 = ROL32(A##ge1, 22); \
+    A##ki0 ^= Di0; \
+    Bbi1 = ROL32(A##ki0, 21); \
+    E##ba1 =   Bba1 ^(  Bbe1 |  Bbi1 ); \
+    E##ba1 ^= KeccakF1600RoundConstants_int2_1[i]; \
+    A##mo0 ^= Do0; \
+    Bbo1 = ROL32(A##mo0, 10); \
+    E##be1 =   Bbe1 ^((~Bbi1)|  Bbo1 ); \
+    A##su1 ^= Du1; \
+    Bbu1 = ROL32(A##su1, 7); \
+    E##bi1 =   Bbi1 ^(  Bbo1 &  Bbu1 ); \
+    E##bo1 =   Bbo1 ^(  Bbu1 |  Bba1 ); \
+    E##bu1 =   Bbu1 ^(  Bba1 &  Bbe1 ); \
+/* E##g?0 from A##bo0, A##gu0, A##ka1, A##me1, A##si1 */ \
+    A##bo0 ^= Do0; \
+    Bga0 = ROL32(A##bo0, 14); \
+    A##gu0 ^= Du0; \
+    Bge0 = ROL32(A##gu0, 10); \
+    A##ka1 ^= Da1; \
+    Bgi0 = ROL32(A##ka1, 2); \
+    E##ga0 =   Bga0 ^(  Bge0 |  Bgi0 ); \
+    A##me1 ^= De1; \
+    Bgo0 = ROL32(A##me1, 23); \
+    E##ge0 =   Bge0 ^(  Bgi0 &  Bgo0 ); \
+    A##si1 ^= Di1; \
+    Bgu0 = ROL32(A##si1, 31); \
+    E##gi0 =   Bgi0 ^(  Bgo0 |(~Bgu0)); \
+    E##go0 =   Bgo0 ^(  Bgu0 |  Bga0 ); \
+    E##gu0 =   Bgu0 ^(  Bga0 &  Bge0 ); \
+/* E##g?1 from A##bo1, A##gu1, A##ka0, A##me0, A##si0 */ \
+    A##bo1 ^= Do1; \
+    Bga1 = ROL32(A##bo1, 14); \
+    A##gu1 ^= Du1; \
+    Bge1 = ROL32(A##gu1, 10); \
+    A##ka0 ^= Da0; \
+    Bgi1 = ROL32(A##ka0, 1); \
+    E##ga1 =   Bga1 ^(  Bge1 |  Bgi1 ); \
+    A##me0 ^= De0; \
+    Bgo1 = ROL32(A##me0, 22); \
+    E##ge1 =   Bge1 ^(  Bgi1 &  Bgo1 ); \
+    A##si0 ^= Di0; \
+    Bgu1 = ROL32(A##si0, 30); \
+    E##gi1 =   Bgi1 ^(  Bgo1 |(~Bgu1)); \
+    E##go1 =   Bgo1 ^(  Bgu1 |  Bga1 ); \
+    E##gu1 =   Bgu1 ^(  Bga1 &  Bge1 ); \
+/* E##k?0 from A##be1, A##gi0, A##ko1, A##mu0, A##sa0 */ \
+    A##be1 ^= De1; \
+    Bka0 = ROL32(A##be1, 1); \
+    A##gi0 ^= Di0; \
+    Bke0 = ROL32(A##gi0, 3); \
+    A##ko1 ^= Do1; \
+    Bki0 = ROL32(A##ko1, 13); \
+    E##ka0 =   Bka0 ^(  Bke0 |  Bki0 ); \
+    A##mu0 ^= Du0; \
+    Bko0 = ROL32(A##mu0, 4); \
+    E##ke0 =   Bke0 ^(  Bki0 &  Bko0 ); \
+    A##sa0 ^= Da0; \
+    Bku0 = ROL32(A##sa0, 9); \
+    E##ki0 =   Bki0 ^((~Bko0)&  Bku0 ); \
+    E##ko0 = (~Bko0)^(  Bku0 |  Bka0 ); \
+    E##ku0 =   Bku0 ^(  Bka0 &  Bke0 ); \
+/* E##k?1 from A##be0 (no rotation), A##gi1, A##ko0, A##mu1, A##sa1 */ \
+    A##be0 ^= De0; \
+    Bka1 = A##be0; \
+    A##gi1 ^= Di1; \
+    Bke1 = ROL32(A##gi1, 3); \
+    A##ko0 ^= Do0; \
+    Bki1 = ROL32(A##ko0, 12); \
+    E##ka1 =   Bka1 ^(  Bke1 |  Bki1 ); \
+    A##mu1 ^= Du1; \
+    Bko1 = ROL32(A##mu1, 4); \
+    E##ke1 =   Bke1 ^(  Bki1 &  Bko1 ); \
+    A##sa1 ^= Da1; \
+    Bku1 = ROL32(A##sa1, 9); \
+    E##ki1 =   Bki1 ^((~Bko1)&  Bku1 ); \
+    E##ko1 = (~Bko1)^(  Bku1 |  Bka1 ); \
+    E##ku1 =   Bku1 ^(  Bka1 &  Bke1 ); \
+/* E##m?0 from A##bu1, A##ga0, A##ke0, A##mi1, A##so0 */ \
+    A##bu1 ^= Du1; \
+    Bma0 = ROL32(A##bu1, 14); \
+    A##ga0 ^= Da0; \
+    Bme0 = ROL32(A##ga0, 18); \
+    A##ke0 ^= De0; \
+    Bmi0 = ROL32(A##ke0, 5); \
+    E##ma0 =   Bma0 ^(  Bme0 &  Bmi0 ); \
+    A##mi1 ^= Di1; \
+    Bmo0 = ROL32(A##mi1, 8); \
+    E##me0 =   Bme0 ^(  Bmi0 |  Bmo0 ); \
+    A##so0 ^= Do0; \
+    Bmu0 = ROL32(A##so0, 28); \
+    E##mi0 =   Bmi0 ^((~Bmo0)|  Bmu0 ); \
+    E##mo0 = (~Bmo0)^(  Bmu0 &  Bma0 ); \
+    E##mu0 =   Bmu0 ^(  Bma0 |  Bme0 ); \
+/* E##m?1 from A##bu0, A##ga1, A##ke1, A##mi0, A##so1 */ \
+    A##bu0 ^= Du0; \
+    Bma1 = ROL32(A##bu0, 13); \
+    A##ga1 ^= Da1; \
+    Bme1 = ROL32(A##ga1, 18); \
+    A##ke1 ^= De1; \
+    Bmi1 = ROL32(A##ke1, 5); \
+    E##ma1 =   Bma1 ^(  Bme1 &  Bmi1 ); \
+    A##mi0 ^= Di0; \
+    Bmo1 = ROL32(A##mi0, 7); \
+    E##me1 =   Bme1 ^(  Bmi1 |  Bmo1 ); \
+    A##so1 ^= Do1; \
+    Bmu1 = ROL32(A##so1, 28); \
+    E##mi1 =   Bmi1 ^((~Bmo1)|  Bmu1 ); \
+    E##mo1 = (~Bmo1)^(  Bmu1 &  Bma1 ); \
+    E##mu1 =   Bmu1 ^(  Bma1 |  Bme1 ); \
+/* E##s?0 from A##bi0, A##go1, A##ku1, A##ma1, A##se0 */ \
+    A##bi0 ^= Di0; \
+    Bsa0 = ROL32(A##bi0, 31); \
+    A##go1 ^= Do1; \
+    Bse0 = ROL32(A##go1, 28); \
+    A##ku1 ^= Du1; \
+    Bsi0 = ROL32(A##ku1, 20); \
+    E##sa0 =   Bsa0 ^((~Bse0)&  Bsi0 ); \
+    A##ma1 ^= Da1; \
+    Bso0 = ROL32(A##ma1, 21); \
+    E##se0 = (~Bse0)^(  Bsi0 |  Bso0 ); \
+    A##se0 ^= De0; \
+    Bsu0 = ROL32(A##se0, 1); \
+    E##si0 =   Bsi0 ^(  Bso0 &  Bsu0 ); \
+    E##so0 =   Bso0 ^(  Bsu0 |  Bsa0 ); \
+    E##su0 =   Bsu0 ^(  Bsa0 &  Bse0 ); \
+/* E##s?1 from A##bi1, A##go0, A##ku0, A##ma0, A##se1 */ \
+    A##bi1 ^= Di1; \
+    Bsa1 = ROL32(A##bi1, 31); \
+    A##go0 ^= Do0; \
+    Bse1 = ROL32(A##go0, 27); \
+    A##ku0 ^= Du0; \
+    Bsi1 = ROL32(A##ku0, 19); \
+    E##sa1 =   Bsa1 ^((~Bse1)&  Bsi1 ); \
+    A##ma0 ^= Da0; \
+    Bso1 = ROL32(A##ma0, 20); \
+    E##se1 = (~Bse1)^(  Bsi1 |  Bso1 ); \
+    A##se1 ^= De1; \
+    Bsu1 = ROL32(A##se1, 1); \
+    E##si1 =   Bsi1 ^(  Bso1 &  Bsu1 ); \
+    E##so1 =   Bso1 ^(  Bsu1 |  Bsa1 ); \
+    E##su1 =   Bsu1 ^(  Bsa1 &  Bse1 ); \
+\
+
+#else /*  UseBebigokimisa */
+/*  --- Code for round, with prepare-theta: D* from C*, A ^= D* in place, B* = rotated A words, every E word = B ^ ((~next B) & next-but-one B) within its group of five, round constant i XORed into E##ba0/E##ba1, and C?w rebuilt as the XOR of E##b?w, E##g?w, E##k?w, E##m?w, E##s?w */
+/*  --- using factor 2 interleaving, 64-bit lanes mapped to 32-bit words; i indexes KeccakF1600RoundConstants_int2_0/_1, A and E are variable-name prefixes (A is modified), B*, C*, D* are fixed local names */
+#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \
+    Da0 = Cu0^ROL32(Ce1, 1); \
+    Da1 = Cu1^Ce0; \
+    De0 = Ca0^ROL32(Ci1, 1); \
+    De1 = Ca1^Ci0; \
+    Di0 = Ce0^ROL32(Co1, 1); \
+    Di1 = Ce1^Co0; \
+    Do0 = Ci0^ROL32(Cu1, 1); \
+    Do1 = Ci1^Cu0; \
+    Du0 = Co0^ROL32(Ca1, 1); \
+    Du1 = Co1^Ca0; \
+/* E##b?0 from A##ba0, A##ge0, A##ki1, A##mo1, A##su0; C?0 is (re)started here with plain assignment */ \
+    A##ba0 ^= Da0; \
+    Bba0 = A##ba0; \
+    A##ge0 ^= De0; \
+    Bbe0 = ROL32(A##ge0, 22); \
+    A##ki1 ^= Di1; \
+    Bbi0 = ROL32(A##ki1, 22); \
+    E##ba0 =   Bba0 ^((~Bbe0)&  Bbi0 ); \
+    E##ba0 ^= KeccakF1600RoundConstants_int2_0[i]; \
+    Ca0 = E##ba0; \
+    A##mo1 ^= Do1; \
+    Bbo0 = ROL32(A##mo1, 11); \
+    E##be0 =   Bbe0 ^((~Bbi0)&  Bbo0 ); \
+    Ce0 = E##be0; \
+    A##su0 ^= Du0; \
+    Bbu0 = ROL32(A##su0, 7); \
+    E##bi0 =   Bbi0 ^((~Bbo0)&  Bbu0 ); \
+    Ci0 = E##bi0; \
+    E##bo0 =   Bbo0 ^((~Bbu0)&  Bba0 ); \
+    Co0 = E##bo0; \
+    E##bu0 =   Bbu0 ^((~Bba0)&  Bbe0 ); \
+    Cu0 = E##bu0; \
+/* E##b?1 from A##ba1, A##ge1, A##ki0, A##mo0, A##su1; C?1 is (re)started here with plain assignment */ \
+    A##ba1 ^= Da1; \
+    Bba1 = A##ba1; \
+    A##ge1 ^= De1; \
+    Bbe1 = ROL32(A##ge1, 22); \
+    A##ki0 ^= Di0; \
+    Bbi1 = ROL32(A##ki0, 21); \
+    E##ba1 =   Bba1 ^((~Bbe1)&  Bbi1 ); \
+    E##ba1 ^= KeccakF1600RoundConstants_int2_1[i]; \
+    Ca1 = E##ba1; \
+    A##mo0 ^= Do0; \
+    Bbo1 = ROL32(A##mo0, 10); \
+    E##be1 =   Bbe1 ^((~Bbi1)&  Bbo1 ); \
+    Ce1 = E##be1; \
+    A##su1 ^= Du1; \
+    Bbu1 = ROL32(A##su1, 7); \
+    E##bi1 =   Bbi1 ^((~Bbo1)&  Bbu1 ); \
+    Ci1 = E##bi1; \
+    E##bo1 =   Bbo1 ^((~Bbu1)&  Bba1 ); \
+    Co1 = E##bo1; \
+    E##bu1 =   Bbu1 ^((~Bba1)&  Bbe1 ); \
+    Cu1 = E##bu1; \
+/* E##g?0 from A##bo0, A##gu0, A##ka1, A##me1, A##si1; XORed into C?0 */ \
+    A##bo0 ^= Do0; \
+    Bga0 = ROL32(A##bo0, 14); \
+    A##gu0 ^= Du0; \
+    Bge0 = ROL32(A##gu0, 10); \
+    A##ka1 ^= Da1; \
+    Bgi0 = ROL32(A##ka1, 2); \
+    E##ga0 =   Bga0 ^((~Bge0)&  Bgi0 ); \
+    Ca0 ^= E##ga0; \
+    A##me1 ^= De1; \
+    Bgo0 = ROL32(A##me1, 23); \
+    E##ge0 =   Bge0 ^((~Bgi0)&  Bgo0 ); \
+    Ce0 ^= E##ge0; \
+    A##si1 ^= Di1; \
+    Bgu0 = ROL32(A##si1, 31); \
+    E##gi0 =   Bgi0 ^((~Bgo0)&  Bgu0 ); \
+    Ci0 ^= E##gi0; \
+    E##go0 =   Bgo0 ^((~Bgu0)&  Bga0 ); \
+    Co0 ^= E##go0; \
+    E##gu0 =   Bgu0 ^((~Bga0)&  Bge0 ); \
+    Cu0 ^= E##gu0; \
+/* E##g?1 from A##bo1, A##gu1, A##ka0, A##me0, A##si0; XORed into C?1 */ \
+    A##bo1 ^= Do1; \
+    Bga1 = ROL32(A##bo1, 14); \
+    A##gu1 ^= Du1; \
+    Bge1 = ROL32(A##gu1, 10); \
+    A##ka0 ^= Da0; \
+    Bgi1 = ROL32(A##ka0, 1); \
+    E##ga1 =   Bga1 ^((~Bge1)&  Bgi1 ); \
+    Ca1 ^= E##ga1; \
+    A##me0 ^= De0; \
+    Bgo1 = ROL32(A##me0, 22); \
+    E##ge1 =   Bge1 ^((~Bgi1)&  Bgo1 ); \
+    Ce1 ^= E##ge1; \
+    A##si0 ^= Di0; \
+    Bgu1 = ROL32(A##si0, 30); \
+    E##gi1 =   Bgi1 ^((~Bgo1)&  Bgu1 ); \
+    Ci1 ^= E##gi1; \
+    E##go1 =   Bgo1 ^((~Bgu1)&  Bga1 ); \
+    Co1 ^= E##go1; \
+    E##gu1 =   Bgu1 ^((~Bga1)&  Bge1 ); \
+    Cu1 ^= E##gu1; \
+/* E##k?0 from A##be1, A##gi0, A##ko1, A##mu0, A##sa0; XORed into C?0 */ \
+    A##be1 ^= De1; \
+    Bka0 = ROL32(A##be1, 1); \
+    A##gi0 ^= Di0; \
+    Bke0 = ROL32(A##gi0, 3); \
+    A##ko1 ^= Do1; \
+    Bki0 = ROL32(A##ko1, 13); \
+    E##ka0 =   Bka0 ^((~Bke0)&  Bki0 ); \
+    Ca0 ^= E##ka0; \
+    A##mu0 ^= Du0; \
+    Bko0 = ROL32(A##mu0, 4); \
+    E##ke0 =   Bke0 ^((~Bki0)&  Bko0 ); \
+    Ce0 ^= E##ke0; \
+    A##sa0 ^= Da0; \
+    Bku0 = ROL32(A##sa0, 9); \
+    E##ki0 =   Bki0 ^((~Bko0)&  Bku0 ); \
+    Ci0 ^= E##ki0; \
+    E##ko0 =   Bko0 ^((~Bku0)&  Bka0 ); \
+    Co0 ^= E##ko0; \
+    E##ku0 =   Bku0 ^((~Bka0)&  Bke0 ); \
+    Cu0 ^= E##ku0; \
+/* E##k?1 from A##be0 (no rotation), A##gi1, A##ko0, A##mu1, A##sa1; XORed into C?1 */ \
+    A##be0 ^= De0; \
+    Bka1 = A##be0; \
+    A##gi1 ^= Di1; \
+    Bke1 = ROL32(A##gi1, 3); \
+    A##ko0 ^= Do0; \
+    Bki1 = ROL32(A##ko0, 12); \
+    E##ka1 =   Bka1 ^((~Bke1)&  Bki1 ); \
+    Ca1 ^= E##ka1; \
+    A##mu1 ^= Du1; \
+    Bko1 = ROL32(A##mu1, 4); \
+    E##ke1 =   Bke1 ^((~Bki1)&  Bko1 ); \
+    Ce1 ^= E##ke1; \
+    A##sa1 ^= Da1; \
+    Bku1 = ROL32(A##sa1, 9); \
+    E##ki1 =   Bki1 ^((~Bko1)&  Bku1 ); \
+    Ci1 ^= E##ki1; \
+    E##ko1 =   Bko1 ^((~Bku1)&  Bka1 ); \
+    Co1 ^= E##ko1; \
+    E##ku1 =   Bku1 ^((~Bka1)&  Bke1 ); \
+    Cu1 ^= E##ku1; \
+/* E##m?0 from A##bu1, A##ga0, A##ke0, A##mi1, A##so0; XORed into C?0 */ \
+    A##bu1 ^= Du1; \
+    Bma0 = ROL32(A##bu1, 14); \
+    A##ga0 ^= Da0; \
+    Bme0 = ROL32(A##ga0, 18); \
+    A##ke0 ^= De0; \
+    Bmi0 = ROL32(A##ke0, 5); \
+    E##ma0 =   Bma0 ^((~Bme0)&  Bmi0 ); \
+    Ca0 ^= E##ma0; \
+    A##mi1 ^= Di1; \
+    Bmo0 = ROL32(A##mi1, 8); \
+    E##me0 =   Bme0 ^((~Bmi0)&  Bmo0 ); \
+    Ce0 ^= E##me0; \
+    A##so0 ^= Do0; \
+    Bmu0 = ROL32(A##so0, 28); \
+    E##mi0 =   Bmi0 ^((~Bmo0)&  Bmu0 ); \
+    Ci0 ^= E##mi0; \
+    E##mo0 =   Bmo0 ^((~Bmu0)&  Bma0 ); \
+    Co0 ^= E##mo0; \
+    E##mu0 =   Bmu0 ^((~Bma0)&  Bme0 ); \
+    Cu0 ^= E##mu0; \
+/* E##m?1 from A##bu0, A##ga1, A##ke1, A##mi0, A##so1; XORed into C?1 */ \
+    A##bu0 ^= Du0; \
+    Bma1 = ROL32(A##bu0, 13); \
+    A##ga1 ^= Da1; \
+    Bme1 = ROL32(A##ga1, 18); \
+    A##ke1 ^= De1; \
+    Bmi1 = ROL32(A##ke1, 5); \
+    E##ma1 =   Bma1 ^((~Bme1)&  Bmi1 ); \
+    Ca1 ^= E##ma1; \
+    A##mi0 ^= Di0; \
+    Bmo1 = ROL32(A##mi0, 7); \
+    E##me1 =   Bme1 ^((~Bmi1)&  Bmo1 ); \
+    Ce1 ^= E##me1; \
+    A##so1 ^= Do1; \
+    Bmu1 = ROL32(A##so1, 28); \
+    E##mi1 =   Bmi1 ^((~Bmo1)&  Bmu1 ); \
+    Ci1 ^= E##mi1; \
+    E##mo1 =   Bmo1 ^((~Bmu1)&  Bma1 ); \
+    Co1 ^= E##mo1; \
+    E##mu1 =   Bmu1 ^((~Bma1)&  Bme1 ); \
+    Cu1 ^= E##mu1; \
+/* E##s?0 from A##bi0, A##go1, A##ku1, A##ma1, A##se0; XORed into C?0 */ \
+    A##bi0 ^= Di0; \
+    Bsa0 = ROL32(A##bi0, 31); \
+    A##go1 ^= Do1; \
+    Bse0 = ROL32(A##go1, 28); \
+    A##ku1 ^= Du1; \
+    Bsi0 = ROL32(A##ku1, 20); \
+    E##sa0 =   Bsa0 ^((~Bse0)&  Bsi0 ); \
+    Ca0 ^= E##sa0; \
+    A##ma1 ^= Da1; \
+    Bso0 = ROL32(A##ma1, 21); \
+    E##se0 =   Bse0 ^((~Bsi0)&  Bso0 ); \
+    Ce0 ^= E##se0; \
+    A##se0 ^= De0; \
+    Bsu0 = ROL32(A##se0, 1); \
+    E##si0 =   Bsi0 ^((~Bso0)&  Bsu0 ); \
+    Ci0 ^= E##si0; \
+    E##so0 =   Bso0 ^((~Bsu0)&  Bsa0 ); \
+    Co0 ^= E##so0; \
+    E##su0 =   Bsu0 ^((~Bsa0)&  Bse0 ); \
+    Cu0 ^= E##su0; \
+/* E##s?1 from A##bi1, A##go0, A##ku0, A##ma0, A##se1; XORed into C?1 */ \
+    A##bi1 ^= Di1; \
+    Bsa1 = ROL32(A##bi1, 31); \
+    A##go0 ^= Do0; \
+    Bse1 = ROL32(A##go0, 27); \
+    A##ku0 ^= Du0; \
+    Bsi1 = ROL32(A##ku0, 19); \
+    E##sa1 =   Bsa1 ^((~Bse1)&  Bsi1 ); \
+    Ca1 ^= E##sa1; \
+    A##ma0 ^= Da0; \
+    Bso1 = ROL32(A##ma0, 20); \
+    E##se1 =   Bse1 ^((~Bsi1)&  Bso1 ); \
+    Ce1 ^= E##se1; \
+    A##se1 ^= De1; \
+    Bsu1 = ROL32(A##se1, 1); \
+    E##si1 =   Bsi1 ^((~Bso1)&  Bsu1 ); \
+    Ci1 ^= E##si1; \
+    E##so1 =   Bso1 ^((~Bsu1)&  Bsa1 ); \
+    Co1 ^= E##so1; \
+    E##su1 =   Bsu1 ^((~Bsa1)&  Bse1 ); \
+    Cu1 ^= E##su1; \
+\
+
+/*  --- Code for round: D* from C*, A ^= D* in place, B* = rotated A words, every E word = B ^ ((~next B) & next-but-one B) within its group of five, round constant i XORed into E##ba0/E##ba1; C* is only read, never updated, by this variant */
+/*  --- using factor 2 interleaving, 64-bit lanes mapped to 32-bit words; i indexes KeccakF1600RoundConstants_int2_0/_1, A and E are variable-name prefixes (A is modified), B*, C*, D* are fixed local names */
+#define thetaRhoPiChiIota(i, A, E) \
+    Da0 = Cu0^ROL32(Ce1, 1); \
+    Da1 = Cu1^Ce0; \
+    De0 = Ca0^ROL32(Ci1, 1); \
+    De1 = Ca1^Ci0; \
+    Di0 = Ce0^ROL32(Co1, 1); \
+    Di1 = Ce1^Co0; \
+    Do0 = Ci0^ROL32(Cu1, 1); \
+    Do1 = Ci1^Cu0; \
+    Du0 = Co0^ROL32(Ca1, 1); \
+    Du1 = Co1^Ca0; \
+/* E##b?0 from A##ba0, A##ge0, A##ki1, A##mo1, A##su0 */ \
+    A##ba0 ^= Da0; \
+    Bba0 = A##ba0; \
+    A##ge0 ^= De0; \
+    Bbe0 = ROL32(A##ge0, 22); \
+    A##ki1 ^= Di1; \
+    Bbi0 = ROL32(A##ki1, 22); \
+    E##ba0 =   Bba0 ^((~Bbe0)&  Bbi0 ); \
+    E##ba0 ^= KeccakF1600RoundConstants_int2_0[i]; \
+    A##mo1 ^= Do1; \
+    Bbo0 = ROL32(A##mo1, 11); \
+    E##be0 =   Bbe0 ^((~Bbi0)&  Bbo0 ); \
+    A##su0 ^= Du0; \
+    Bbu0 = ROL32(A##su0, 7); \
+    E##bi0 =   Bbi0 ^((~Bbo0)&  Bbu0 ); \
+    E##bo0 =   Bbo0 ^((~Bbu0)&  Bba0 ); \
+    E##bu0 =   Bbu0 ^((~Bba0)&  Bbe0 ); \
+/* E##b?1 from A##ba1, A##ge1, A##ki0, A##mo0, A##su1 */ \
+    A##ba1 ^= Da1; \
+    Bba1 = A##ba1; \
+    A##ge1 ^= De1; \
+    Bbe1 = ROL32(A##ge1, 22); \
+    A##ki0 ^= Di0; \
+    Bbi1 = ROL32(A##ki0, 21); \
+    E##ba1 =   Bba1 ^((~Bbe1)&  Bbi1 ); \
+    E##ba1 ^= KeccakF1600RoundConstants_int2_1[i]; \
+    A##mo0 ^= Do0; \
+    Bbo1 = ROL32(A##mo0, 10); \
+    E##be1 =   Bbe1 ^((~Bbi1)&  Bbo1 ); \
+    A##su1 ^= Du1; \
+    Bbu1 = ROL32(A##su1, 7); \
+    E##bi1 =   Bbi1 ^((~Bbo1)&  Bbu1 ); \
+    E##bo1 =   Bbo1 ^((~Bbu1)&  Bba1 ); \
+    E##bu1 =   Bbu1 ^((~Bba1)&  Bbe1 ); \
+/* E##g?0 from A##bo0, A##gu0, A##ka1, A##me1, A##si1 */ \
+    A##bo0 ^= Do0; \
+    Bga0 = ROL32(A##bo0, 14); \
+    A##gu0 ^= Du0; \
+    Bge0 = ROL32(A##gu0, 10); \
+    A##ka1 ^= Da1; \
+    Bgi0 = ROL32(A##ka1, 2); \
+    E##ga0 =   Bga0 ^((~Bge0)&  Bgi0 ); \
+    A##me1 ^= De1; \
+    Bgo0 = ROL32(A##me1, 23); \
+    E##ge0 =   Bge0 ^((~Bgi0)&  Bgo0 ); \
+    A##si1 ^= Di1; \
+    Bgu0 = ROL32(A##si1, 31); \
+    E##gi0 =   Bgi0 ^((~Bgo0)&  Bgu0 ); \
+    E##go0 =   Bgo0 ^((~Bgu0)&  Bga0 ); \
+    E##gu0 =   Bgu0 ^((~Bga0)&  Bge0 ); \
+/* E##g?1 from A##bo1, A##gu1, A##ka0, A##me0, A##si0 */ \
+    A##bo1 ^= Do1; \
+    Bga1 = ROL32(A##bo1, 14); \
+    A##gu1 ^= Du1; \
+    Bge1 = ROL32(A##gu1, 10); \
+    A##ka0 ^= Da0; \
+    Bgi1 = ROL32(A##ka0, 1); \
+    E##ga1 =   Bga1 ^((~Bge1)&  Bgi1 ); \
+    A##me0 ^= De0; \
+    Bgo1 = ROL32(A##me0, 22); \
+    E##ge1 =   Bge1 ^((~Bgi1)&  Bgo1 ); \
+    A##si0 ^= Di0; \
+    Bgu1 = ROL32(A##si0, 30); \
+    E##gi1 =   Bgi1 ^((~Bgo1)&  Bgu1 ); \
+    E##go1 =   Bgo1 ^((~Bgu1)&  Bga1 ); \
+    E##gu1 =   Bgu1 ^((~Bga1)&  Bge1 ); \
+/* E##k?0 from A##be1, A##gi0, A##ko1, A##mu0, A##sa0 */ \
+    A##be1 ^= De1; \
+    Bka0 = ROL32(A##be1, 1); \
+    A##gi0 ^= Di0; \
+    Bke0 = ROL32(A##gi0, 3); \
+    A##ko1 ^= Do1; \
+    Bki0 = ROL32(A##ko1, 13); \
+    E##ka0 =   Bka0 ^((~Bke0)&  Bki0 ); \
+    A##mu0 ^= Du0; \
+    Bko0 = ROL32(A##mu0, 4); \
+    E##ke0 =   Bke0 ^((~Bki0)&  Bko0 ); \
+    A##sa0 ^= Da0; \
+    Bku0 = ROL32(A##sa0, 9); \
+    E##ki0 =   Bki0 ^((~Bko0)&  Bku0 ); \
+    E##ko0 =   Bko0 ^((~Bku0)&  Bka0 ); \
+    E##ku0 =   Bku0 ^((~Bka0)&  Bke0 ); \
+/* E##k?1 from A##be0 (no rotation), A##gi1, A##ko0, A##mu1, A##sa1 */ \
+    A##be0 ^= De0; \
+    Bka1 = A##be0; \
+    A##gi1 ^= Di1; \
+    Bke1 = ROL32(A##gi1, 3); \
+    A##ko0 ^= Do0; \
+    Bki1 = ROL32(A##ko0, 12); \
+    E##ka1 =   Bka1 ^((~Bke1)&  Bki1 ); \
+    A##mu1 ^= Du1; \
+    Bko1 = ROL32(A##mu1, 4); \
+    E##ke1 =   Bke1 ^((~Bki1)&  Bko1 ); \
+    A##sa1 ^= Da1; \
+    Bku1 = ROL32(A##sa1, 9); \
+    E##ki1 =   Bki1 ^((~Bko1)&  Bku1 ); \
+    E##ko1 =   Bko1 ^((~Bku1)&  Bka1 ); \
+    E##ku1 =   Bku1 ^((~Bka1)&  Bke1 ); \
+/* E##m?0 from A##bu1, A##ga0, A##ke0, A##mi1, A##so0 */ \
+    A##bu1 ^= Du1; \
+    Bma0 = ROL32(A##bu1, 14); \
+    A##ga0 ^= Da0; \
+    Bme0 = ROL32(A##ga0, 18); \
+    A##ke0 ^= De0; \
+    Bmi0 = ROL32(A##ke0, 5); \
+    E##ma0 =   Bma0 ^((~Bme0)&  Bmi0 ); \
+    A##mi1 ^= Di1; \
+    Bmo0 = ROL32(A##mi1, 8); \
+    E##me0 =   Bme0 ^((~Bmi0)&  Bmo0 ); \
+    A##so0 ^= Do0; \
+    Bmu0 = ROL32(A##so0, 28); \
+    E##mi0 =   Bmi0 ^((~Bmo0)&  Bmu0 ); \
+    E##mo0 =   Bmo0 ^((~Bmu0)&  Bma0 ); \
+    E##mu0 =   Bmu0 ^((~Bma0)&  Bme0 ); \
+/* E##m?1 from A##bu0, A##ga1, A##ke1, A##mi0, A##so1 */ \
+    A##bu0 ^= Du0; \
+    Bma1 = ROL32(A##bu0, 13); \
+    A##ga1 ^= Da1; \
+    Bme1 = ROL32(A##ga1, 18); \
+    A##ke1 ^= De1; \
+    Bmi1 = ROL32(A##ke1, 5); \
+    E##ma1 =   Bma1 ^((~Bme1)&  Bmi1 ); \
+    A##mi0 ^= Di0; \
+    Bmo1 = ROL32(A##mi0, 7); \
+    E##me1 =   Bme1 ^((~Bmi1)&  Bmo1 ); \
+    A##so1 ^= Do1; \
+    Bmu1 = ROL32(A##so1, 28); \
+    E##mi1 =   Bmi1 ^((~Bmo1)&  Bmu1 ); \
+    E##mo1 =   Bmo1 ^((~Bmu1)&  Bma1 ); \
+    E##mu1 =   Bmu1 ^((~Bma1)&  Bme1 ); \
+/* E##s?0 from A##bi0, A##go1, A##ku1, A##ma1, A##se0 */ \
+    A##bi0 ^= Di0; \
+    Bsa0 = ROL32(A##bi0, 31); \
+    A##go1 ^= Do1; \
+    Bse0 = ROL32(A##go1, 28); \
+    A##ku1 ^= Du1; \
+    Bsi0 = ROL32(A##ku1, 20); \
+    E##sa0 =   Bsa0 ^((~Bse0)&  Bsi0 ); \
+    A##ma1 ^= Da1; \
+    Bso0 = ROL32(A##ma1, 21); \
+    E##se0 =   Bse0 ^((~Bsi0)&  Bso0 ); \
+    A##se0 ^= De0; \
+    Bsu0 = ROL32(A##se0, 1); \
+    E##si0 =   Bsi0 ^((~Bso0)&  Bsu0 ); \
+    E##so0 =   Bso0 ^((~Bsu0)&  Bsa0 ); \
+    E##su0 =   Bsu0 ^((~Bsa0)&  Bse0 ); \
+/* E##s?1 from A##bi1, A##go0, A##ku0, A##ma0, A##se1 */ \
+    A##bi1 ^= Di1; \
+    Bsa1 = ROL32(A##bi1, 31); \
+    A##go0 ^= Do0; \
+    Bse1 = ROL32(A##go0, 27); \
+    A##ku0 ^= Du0; \
+    Bsi1 = ROL32(A##ku0, 19); \
+    E##sa1 =   Bsa1 ^((~Bse1)&  Bsi1 ); \
+    A##ma0 ^= Da0; \
+    Bso1 = ROL32(A##ma0, 20); \
+    E##se1 =   Bse1 ^((~Bsi1)&  Bso1 ); \
+    A##se1 ^= De1; \
+    Bsu1 = ROL32(A##se1, 1); \
+    E##si1 =   Bsi1 ^((~Bso1)&  Bsu1 ); \
+    E##so1 =   Bso1 ^((~Bsu1)&  Bsa1 ); \
+    E##su1 =   Bsu1 ^((~Bsa1)&  Bse1 ); \
+\
+
+#endif /*  UseBebigokimisa */
+
+const UINT32 KeccakF1600RoundConstants_int2_0[24] = { /* 24 entries indexed by round i; entry i is XORed into E##ba0 by the round macros. NOTE(review): this is a definition without `static` in a file meant to be #included - presumably included from a single translation unit only; confirm */
+    0x00000001UL,
+    0x00000000UL,
+    0x00000000UL,
+    0x00000000UL,
+    0x00000001UL,
+    0x00000001UL,
+    0x00000001UL,
+    0x00000001UL,
+    0x00000000UL,
+    0x00000000UL,
+    0x00000001UL,
+    0x00000000UL,
+    0x00000001UL,
+    0x00000001UL,
+    0x00000001UL,
+    0x00000001UL,
+    0x00000000UL,
+    0x00000000UL,
+    0x00000000UL,
+    0x00000000UL,
+    0x00000001UL,
+    0x00000000UL,
+    0x00000001UL,
+    0x00000000UL };
+
+const UINT32 KeccakF1600RoundConstants_int2_1[24] = { /* 24 entries indexed by round i; entry i is XORed into E##ba1 by the round macros (companion of KeccakF1600RoundConstants_int2_0, which goes into E##ba0) */
+    0x00000000UL,
+    0x00000089UL,
+    0x8000008bUL,
+    0x80008080UL,
+    0x0000008bUL,
+    0x00008000UL,
+    0x80008088UL,
+    0x80000082UL,
+    0x0000000bUL,
+    0x0000000aUL,
+    0x00008082UL,
+    0x00008003UL,
+    0x0000808bUL,
+    0x8000000bUL,
+    0x8000008aUL,
+    0x80000081UL,
+    0x80000081UL,
+    0x80000008UL,
+    0x00000083UL,
+    0x80008003UL,
+    0x80008088UL,
+    0x80000088UL,
+    0x00008000UL,
+    0x80008082UL };
+
+#define copyFromStateAndXor1024bits(X, state, input) /* load state[0..49] into the variables X##ba0 .. X##su1; the first 32 words (32 x 32 = 1024 bits) are XORed with input[0..31] on the way; neither array is written */ \
+    X##ba0 = state[ 0]^input[ 0]; \
+    X##ba1 = state[ 1]^input[ 1]; \
+    X##be0 = state[ 2]^input[ 2]; \
+    X##be1 = state[ 3]^input[ 3]; \
+    X##bi0 = state[ 4]^input[ 4]; \
+    X##bi1 = state[ 5]^input[ 5]; \
+    X##bo0 = state[ 6]^input[ 6]; \
+    X##bo1 = state[ 7]^input[ 7]; \
+    X##bu0 = state[ 8]^input[ 8]; \
+    X##bu1 = state[ 9]^input[ 9]; \
+    X##ga0 = state[10]^input[10]; \
+    X##ga1 = state[11]^input[11]; \
+    X##ge0 = state[12]^input[12]; \
+    X##ge1 = state[13]^input[13]; \
+    X##gi0 = state[14]^input[14]; \
+    X##gi1 = state[15]^input[15]; \
+    X##go0 = state[16]^input[16]; \
+    X##go1 = state[17]^input[17]; \
+    X##gu0 = state[18]^input[18]; \
+    X##gu1 = state[19]^input[19]; \
+    X##ka0 = state[20]^input[20]; \
+    X##ka1 = state[21]^input[21]; \
+    X##ke0 = state[22]^input[22]; \
+    X##ke1 = state[23]^input[23]; \
+    X##ki0 = state[24]^input[24]; \
+    X##ki1 = state[25]^input[25]; \
+    X##ko0 = state[26]^input[26]; \
+    X##ko1 = state[27]^input[27]; \
+    X##ku0 = state[28]^input[28]; \
+    X##ku1 = state[29]^input[29]; \
+    X##ma0 = state[30]^input[30]; \
+    X##ma1 = state[31]^input[31]; \
+    X##me0 = state[32]; /* from word 32 on: plain copy, input is not read */ \
+    X##me1 = state[33]; \
+    X##mi0 = state[34]; \
+    X##mi1 = state[35]; \
+    X##mo0 = state[36]; \
+    X##mo1 = state[37]; \
+    X##mu0 = state[38]; \
+    X##mu1 = state[39]; \
+    X##sa0 = state[40]; \
+    X##sa1 = state[41]; \
+    X##se0 = state[42]; \
+    X##se1 = state[43]; \
+    X##si0 = state[44]; \
+    X##si1 = state[45]; \
+    X##so0 = state[46]; \
+    X##so1 = state[47]; \
+    X##su0 = state[48]; \
+    X##su1 = state[49]; \
+
+#define copyFromStateAndXor1088bits(X, state, input) /* load state[0..49] into the variables X##ba0 .. X##su1; the first 34 words (34 x 32 = 1088 bits) are XORed with input[0..33] on the way; neither array is written */ \
+    X##ba0 = state[ 0]^input[ 0]; \
+    X##ba1 = state[ 1]^input[ 1]; \
+    X##be0 = state[ 2]^input[ 2]; \
+    X##be1 = state[ 3]^input[ 3]; \
+    X##bi0 = state[ 4]^input[ 4]; \
+    X##bi1 = state[ 5]^input[ 5]; \
+    X##bo0 = state[ 6]^input[ 6]; \
+    X##bo1 = state[ 7]^input[ 7]; \
+    X##bu0 = state[ 8]^input[ 8]; \
+    X##bu1 = state[ 9]^input[ 9]; \
+    X##ga0 = state[10]^input[10]; \
+    X##ga1 = state[11]^input[11]; \
+    X##ge0 = state[12]^input[12]; \
+    X##ge1 = state[13]^input[13]; \
+    X##gi0 = state[14]^input[14]; \
+    X##gi1 = state[15]^input[15]; \
+    X##go0 = state[16]^input[16]; \
+    X##go1 = state[17]^input[17]; \
+    X##gu0 = state[18]^input[18]; \
+    X##gu1 = state[19]^input[19]; \
+    X##ka0 = state[20]^input[20]; \
+    X##ka1 = state[21]^input[21]; \
+    X##ke0 = state[22]^input[22]; \
+    X##ke1 = state[23]^input[23]; \
+    X##ki0 = state[24]^input[24]; \
+    X##ki1 = state[25]^input[25]; \
+    X##ko0 = state[26]^input[26]; \
+    X##ko1 = state[27]^input[27]; \
+    X##ku0 = state[28]^input[28]; \
+    X##ku1 = state[29]^input[29]; \
+    X##ma0 = state[30]^input[30]; \
+    X##ma1 = state[31]^input[31]; \
+    X##me0 = state[32]^input[32]; \
+    X##me1 = state[33]^input[33]; \
+    X##mi0 = state[34]; /* from word 34 on: plain copy, input is not read */ \
+    X##mi1 = state[35]; \
+    X##mo0 = state[36]; \
+    X##mo1 = state[37]; \
+    X##mu0 = state[38]; \
+    X##mu1 = state[39]; \
+    X##sa0 = state[40]; \
+    X##sa1 = state[41]; \
+    X##se0 = state[42]; \
+    X##se1 = state[43]; \
+    X##si0 = state[44]; \
+    X##si1 = state[45]; \
+    X##so0 = state[46]; \
+    X##so1 = state[47]; \
+    X##su0 = state[48]; \
+    X##su1 = state[49]; \
+
+#define copyFromState(X, state) /* load state[0..49] unchanged into the variables X##ba0 .. X##su1 (index 2k -> word 0, 2k+1 -> word 1 of the k-th lane name); state is not written */ \
+    X##ba0 = state[ 0]; \
+    X##ba1 = state[ 1]; \
+    X##be0 = state[ 2]; \
+    X##be1 = state[ 3]; \
+    X##bi0 = state[ 4]; \
+    X##bi1 = state[ 5]; \
+    X##bo0 = state[ 6]; \
+    X##bo1 = state[ 7]; \
+    X##bu0 = state[ 8]; \
+    X##bu1 = state[ 9]; \
+    X##ga0 = state[10]; \
+    X##ga1 = state[11]; \
+    X##ge0 = state[12]; \
+    X##ge1 = state[13]; \
+    X##gi0 = state[14]; \
+    X##gi1 = state[15]; \
+    X##go0 = state[16]; \
+    X##go1 = state[17]; \
+    X##gu0 = state[18]; \
+    X##gu1 = state[19]; \
+    X##ka0 = state[20]; \
+    X##ka1 = state[21]; \
+    X##ke0 = state[22]; \
+    X##ke1 = state[23]; \
+    X##ki0 = state[24]; \
+    X##ki1 = state[25]; \
+    X##ko0 = state[26]; \
+    X##ko1 = state[27]; \
+    X##ku0 = state[28]; \
+    X##ku1 = state[29]; \
+    X##ma0 = state[30]; \
+    X##ma1 = state[31]; \
+    X##me0 = state[32]; \
+    X##me1 = state[33]; \
+    X##mi0 = state[34]; \
+    X##mi1 = state[35]; \
+    X##mo0 = state[36]; \
+    X##mo1 = state[37]; \
+    X##mu0 = state[38]; \
+    X##mu1 = state[39]; \
+    X##sa0 = state[40]; \
+    X##sa1 = state[41]; \
+    X##se0 = state[42]; \
+    X##se1 = state[43]; \
+    X##si0 = state[44]; \
+    X##si1 = state[45]; \
+    X##so0 = state[46]; \
+    X##so1 = state[47]; \
+    X##su0 = state[48]; \
+    X##su1 = state[49]; \
+
+#define copyToState(state, X) /* store the variables X##ba0 .. X##su1 into state[0..49], using the same index order in which copyFromState reads them; note the argument order (state first) */ \
+    state[ 0] = X##ba0; \
+    state[ 1] = X##ba1; \
+    state[ 2] = X##be0; \
+    state[ 3] = X##be1; \
+    state[ 4] = X##bi0; \
+    state[ 5] = X##bi1; \
+    state[ 6] = X##bo0; \
+    state[ 7] = X##bo1; \
+    state[ 8] = X##bu0; \
+    state[ 9] = X##bu1; \
+    state[10] = X##ga0; \
+    state[11] = X##ga1; \
+    state[12] = X##ge0; \
+    state[13] = X##ge1; \
+    state[14] = X##gi0; \
+    state[15] = X##gi1; \
+    state[16] = X##go0; \
+    state[17] = X##go1; \
+    state[18] = X##gu0; \
+    state[19] = X##gu1; \
+    state[20] = X##ka0; \
+    state[21] = X##ka1; \
+    state[22] = X##ke0; \
+    state[23] = X##ke1; \
+    state[24] = X##ki0; \
+    state[25] = X##ki1; \
+    state[26] = X##ko0; \
+    state[27] = X##ko1; \
+    state[28] = X##ku0; \
+    state[29] = X##ku1; \
+    state[30] = X##ma0; \
+    state[31] = X##ma1; \
+    state[32] = X##me0; \
+    state[33] = X##me1; \
+    state[34] = X##mi0; \
+    state[35] = X##mi1; \
+    state[36] = X##mo0; \
+    state[37] = X##mo1; \
+    state[38] = X##mu0; \
+    state[39] = X##mu1; \
+    state[40] = X##sa0; \
+    state[41] = X##sa1; \
+    state[42] = X##se0; \
+    state[43] = X##se1; \
+    state[44] = X##si0; \
+    state[45] = X##si1; \
+    state[46] = X##so0; \
+    state[47] = X##so1; \
+    state[48] = X##su0; \
+    state[49] = X##su1; \
+
+#define copyStateVariables(X, Y) /* assign each of the 50 state words of variable set Y (Y##ba0 .. Y##su1) to the same-named word of set X; X and Y are name prefixes such as the A and E sets of declareABCDE */ \
+    X##ba0 = Y##ba0; \
+    X##ba1 = Y##ba1; \
+    X##be0 = Y##be0; \
+    X##be1 = Y##be1; \
+    X##bi0 = Y##bi0; \
+    X##bi1 = Y##bi1; \
+    X##bo0 = Y##bo0; \
+    X##bo1 = Y##bo1; \
+    X##bu0 = Y##bu0; \
+    X##bu1 = Y##bu1; \
+    X##ga0 = Y##ga0; \
+    X##ga1 = Y##ga1; \
+    X##ge0 = Y##ge0; \
+    X##ge1 = Y##ge1; \
+    X##gi0 = Y##gi0; \
+    X##gi1 = Y##gi1; \
+    X##go0 = Y##go0; \
+    X##go1 = Y##go1; \
+    X##gu0 = Y##gu0; \
+    X##gu1 = Y##gu1; \
+    X##ka0 = Y##ka0; \
+    X##ka1 = Y##ka1; \
+    X##ke0 = Y##ke0; \
+    X##ke1 = Y##ke1; \
+    X##ki0 = Y##ki0; \
+    X##ki1 = Y##ki1; \
+    X##ko0 = Y##ko0; \
+    X##ko1 = Y##ko1; \
+    X##ku0 = Y##ku0; \
+    X##ku1 = Y##ku1; \
+    X##ma0 = Y##ma0; \
+    X##ma1 = Y##ma1; \
+    X##me0 = Y##me0; \
+    X##me1 = Y##me1; \
+    X##mi0 = Y##mi0; \
+    X##mi1 = Y##mi1; \
+    X##mo0 = Y##mo0; \
+    X##mo1 = Y##mo1; \
+    X##mu0 = Y##mu0; \
+    X##mu1 = Y##mu1; \
+    X##sa0 = Y##sa0; \
+    X##sa1 = Y##sa1; \
+    X##se0 = Y##se0; \
+    X##se1 = Y##se1; \
+    X##si0 = Y##si0; \
+    X##si1 = Y##si1; \
+    X##so0 = Y##so0; \
+    X##so1 = Y##so1; \
+    X##su0 = Y##su0; \
+    X##su1 = Y##su1; \
+
diff --git a/Modules/_sha3/keccak/KeccakF-1600-32.macros b/Modules/_sha3/keccak/KeccakF-1600-32.macros
new file mode 100644
--- /dev/null
+++ b/Modules/_sha3/keccak/KeccakF-1600-32.macros
@@ -0,0 +1,26 @@
+/*
+The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+Michaël Peeters and Gilles Van Assche. For more information, feedback or
+questions, please refer to our website: http://keccak.noekeon.org/
+
+Implementation by the designers,
+hereby denoted as "the implementer".
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+/* Select the 32-bit round-macro file: UseSchedule 1, 2 or 3 picks the s1, s2
+ * or rvk macros respectively; any other value is a compile-time error.  When
+ * UseSchedule is not defined at all, the s1 macros are used. */
+#ifdef UseSchedule
+    #if (UseSchedule == 1)
+        #include "KeccakF-1600-32-s1.macros"
+    #elif (UseSchedule == 2)
+        #include "KeccakF-1600-32-s2.macros"
+    #elif (UseSchedule == 3)
+        #include "KeccakF-1600-32-rvk.macros"
+    #else
+        #error "This schedule is not supported."
+    #endif
+#else
+    #include "KeccakF-1600-32-s1.macros"
+#endif
diff --git a/Modules/_sha3/keccak/KeccakF-1600-64.macros b/Modules/_sha3/keccak/KeccakF-1600-64.macros
new file mode 100644
--- /dev/null
+++ b/Modules/_sha3/keccak/KeccakF-1600-64.macros
@@ -0,0 +1,728 @@
+/*
+Code automatically generated by KeccakTools!
+
+The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+Michaël Peeters and Gilles Van Assche. For more information, feedback or
+questions, please refer to our website: http://keccak.noekeon.org/
+
+Implementation by the designers,
+hereby denoted as "the implementer".
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+/* Declare the UINT64 working variables used by the round macros below:
+ * 25 A lanes (round input), 25 B temporaries (rotated lanes), 5 C column
+ * parities, 5 D values derived from C, and 25 E lanes (round output). */
+#define declareABCDE \
+    UINT64 Aba, Abe, Abi, Abo, Abu; \
+    UINT64 Aga, Age, Agi, Ago, Agu; \
+    UINT64 Aka, Ake, Aki, Ako, Aku; \
+    UINT64 Ama, Ame, Ami, Amo, Amu; \
+    UINT64 Asa, Ase, Asi, Aso, Asu; \
+    UINT64 Bba, Bbe, Bbi, Bbo, Bbu; \
+    UINT64 Bga, Bge, Bgi, Bgo, Bgu; \
+    UINT64 Bka, Bke, Bki, Bko, Bku; \
+    UINT64 Bma, Bme, Bmi, Bmo, Bmu; \
+    UINT64 Bsa, Bse, Bsi, Bso, Bsu; \
+    UINT64 Ca, Ce, Ci, Co, Cu; \
+    UINT64 Da, De, Di, Do, Du; \
+    UINT64 Eba, Ebe, Ebi, Ebo, Ebu; \
+    UINT64 Ega, Ege, Egi, Ego, Egu; \
+    UINT64 Eka, Eke, Eki, Eko, Eku; \
+    UINT64 Ema, Eme, Emi, Emo, Emu; \
+    UINT64 Esa, Ese, Esi, Eso, Esu; \
+
+/* Compute the five column parities Ca..Cu of the A lanes: each C value is the
+ * XOR of the five A lanes that share its final letter (a, e, i, o or u). */
+#define prepareTheta \
+    Ca = Aba^Aga^Aka^Ama^Asa; \
+    Ce = Abe^Age^Ake^Ame^Ase; \
+    Ci = Abi^Agi^Aki^Ami^Asi; \
+    Co = Abo^Ago^Ako^Amo^Aso; \
+    Cu = Abu^Agu^Aku^Amu^Asu; \
+
+#ifdef UseBebigokimisa
+/*  --- Code for round, with prepare-theta (lane complementing pattern 'bebigokimisa') */
+/*  --- 64-bit lanes mapped to 64-bit words */
+/*  One round reading lanes A##xx and writing lanes E##xx, XORing
+ *  KeccakF1600RoundConstants[i] into E##ba.  Expects the column parities of
+ *  the A lanes in Ca..Cu on entry and leaves the column parities of the E
+ *  lanes in Ca..Cu on exit.  The A lanes are modified in place (each is XORed
+ *  with its D value).  Several chi terms use |, or a complemented operand,
+ *  where the plain variant uses (~x)&y, following the lane complementing
+ *  pattern named above. */
+#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \
+    Da = Cu^ROL64(Ce, 1); \
+    De = Ca^ROL64(Ci, 1); \
+    Di = Ce^ROL64(Co, 1); \
+    Do = Ci^ROL64(Cu, 1); \
+    Du = Co^ROL64(Ca, 1); \
+\
+    A##ba ^= Da; \
+    Bba = A##ba; \
+    A##ge ^= De; \
+    Bbe = ROL64(A##ge, 44); \
+    A##ki ^= Di; \
+    Bbi = ROL64(A##ki, 43); \
+    A##mo ^= Do; \
+    Bbo = ROL64(A##mo, 21); \
+    A##su ^= Du; \
+    Bbu = ROL64(A##su, 14); \
+    E##ba =   Bba ^(  Bbe |  Bbi ); \
+    E##ba ^= KeccakF1600RoundConstants[i]; \
+    Ca = E##ba; \
+    E##be =   Bbe ^((~Bbi)|  Bbo ); \
+    Ce = E##be; \
+    E##bi =   Bbi ^(  Bbo &  Bbu ); \
+    Ci = E##bi; \
+    E##bo =   Bbo ^(  Bbu |  Bba ); \
+    Co = E##bo; \
+    E##bu =   Bbu ^(  Bba &  Bbe ); \
+    Cu = E##bu; \
+\
+    A##bo ^= Do; \
+    Bga = ROL64(A##bo, 28); \
+    A##gu ^= Du; \
+    Bge = ROL64(A##gu, 20); \
+    A##ka ^= Da; \
+    Bgi = ROL64(A##ka, 3); \
+    A##me ^= De; \
+    Bgo = ROL64(A##me, 45); \
+    A##si ^= Di; \
+    Bgu = ROL64(A##si, 61); \
+    E##ga =   Bga ^(  Bge |  Bgi ); \
+    Ca ^= E##ga; \
+    E##ge =   Bge ^(  Bgi &  Bgo ); \
+    Ce ^= E##ge; \
+    E##gi =   Bgi ^(  Bgo |(~Bgu)); \
+    Ci ^= E##gi; \
+    E##go =   Bgo ^(  Bgu |  Bga ); \
+    Co ^= E##go; \
+    E##gu =   Bgu ^(  Bga &  Bge ); \
+    Cu ^= E##gu; \
+\
+    A##be ^= De; \
+    Bka = ROL64(A##be, 1); \
+    A##gi ^= Di; \
+    Bke = ROL64(A##gi, 6); \
+    A##ko ^= Do; \
+    Bki = ROL64(A##ko, 25); \
+    A##mu ^= Du; \
+    Bko = ROL64(A##mu, 8); \
+    A##sa ^= Da; \
+    Bku = ROL64(A##sa, 18); \
+    E##ka =   Bka ^(  Bke |  Bki ); \
+    Ca ^= E##ka; \
+    E##ke =   Bke ^(  Bki &  Bko ); \
+    Ce ^= E##ke; \
+    E##ki =   Bki ^((~Bko)&  Bku ); \
+    Ci ^= E##ki; \
+    E##ko = (~Bko)^(  Bku |  Bka ); \
+    Co ^= E##ko; \
+    E##ku =   Bku ^(  Bka &  Bke ); \
+    Cu ^= E##ku; \
+\
+    A##bu ^= Du; \
+    Bma = ROL64(A##bu, 27); \
+    A##ga ^= Da; \
+    Bme = ROL64(A##ga, 36); \
+    A##ke ^= De; \
+    Bmi = ROL64(A##ke, 10); \
+    A##mi ^= Di; \
+    Bmo = ROL64(A##mi, 15); \
+    A##so ^= Do; \
+    Bmu = ROL64(A##so, 56); \
+    E##ma =   Bma ^(  Bme &  Bmi ); \
+    Ca ^= E##ma; \
+    E##me =   Bme ^(  Bmi |  Bmo ); \
+    Ce ^= E##me; \
+    E##mi =   Bmi ^((~Bmo)|  Bmu ); \
+    Ci ^= E##mi; \
+    E##mo = (~Bmo)^(  Bmu &  Bma ); \
+    Co ^= E##mo; \
+    E##mu =   Bmu ^(  Bma |  Bme ); \
+    Cu ^= E##mu; \
+\
+    A##bi ^= Di; \
+    Bsa = ROL64(A##bi, 62); \
+    A##go ^= Do; \
+    Bse = ROL64(A##go, 55); \
+    A##ku ^= Du; \
+    Bsi = ROL64(A##ku, 39); \
+    A##ma ^= Da; \
+    Bso = ROL64(A##ma, 41); \
+    A##se ^= De; \
+    Bsu = ROL64(A##se, 2); \
+    E##sa =   Bsa ^((~Bse)&  Bsi ); \
+    Ca ^= E##sa; \
+    E##se = (~Bse)^(  Bsi |  Bso ); \
+    Ce ^= E##se; \
+    E##si =   Bsi ^(  Bso &  Bsu ); \
+    Ci ^= E##si; \
+    E##so =   Bso ^(  Bsu |  Bsa ); \
+    Co ^= E##so; \
+    E##su =   Bsu ^(  Bsa &  Bse ); \
+    Cu ^= E##su; \
+\
+
+/*  --- Code for round (lane complementing pattern 'bebigokimisa') */
+/*  --- 64-bit lanes mapped to 64-bit words */
+/*  Same round computation as thetaRhoPiChiIotaPrepareTheta, but without
+ *  accumulating the column parities of the E lanes: Ca..Cu are read on entry
+ *  and not written.  The A lanes are modified in place; the result is in the
+ *  E lanes. */
+#define thetaRhoPiChiIota(i, A, E) \
+    Da = Cu^ROL64(Ce, 1); \
+    De = Ca^ROL64(Ci, 1); \
+    Di = Ce^ROL64(Co, 1); \
+    Do = Ci^ROL64(Cu, 1); \
+    Du = Co^ROL64(Ca, 1); \
+\
+    A##ba ^= Da; \
+    Bba = A##ba; \
+    A##ge ^= De; \
+    Bbe = ROL64(A##ge, 44); \
+    A##ki ^= Di; \
+    Bbi = ROL64(A##ki, 43); \
+    A##mo ^= Do; \
+    Bbo = ROL64(A##mo, 21); \
+    A##su ^= Du; \
+    Bbu = ROL64(A##su, 14); \
+    E##ba =   Bba ^(  Bbe |  Bbi ); \
+    E##ba ^= KeccakF1600RoundConstants[i]; \
+    E##be =   Bbe ^((~Bbi)|  Bbo ); \
+    E##bi =   Bbi ^(  Bbo &  Bbu ); \
+    E##bo =   Bbo ^(  Bbu |  Bba ); \
+    E##bu =   Bbu ^(  Bba &  Bbe ); \
+\
+    A##bo ^= Do; \
+    Bga = ROL64(A##bo, 28); \
+    A##gu ^= Du; \
+    Bge = ROL64(A##gu, 20); \
+    A##ka ^= Da; \
+    Bgi = ROL64(A##ka, 3); \
+    A##me ^= De; \
+    Bgo = ROL64(A##me, 45); \
+    A##si ^= Di; \
+    Bgu = ROL64(A##si, 61); \
+    E##ga =   Bga ^(  Bge |  Bgi ); \
+    E##ge =   Bge ^(  Bgi &  Bgo ); \
+    E##gi =   Bgi ^(  Bgo |(~Bgu)); \
+    E##go =   Bgo ^(  Bgu |  Bga ); \
+    E##gu =   Bgu ^(  Bga &  Bge ); \
+\
+    A##be ^= De; \
+    Bka = ROL64(A##be, 1); \
+    A##gi ^= Di; \
+    Bke = ROL64(A##gi, 6); \
+    A##ko ^= Do; \
+    Bki = ROL64(A##ko, 25); \
+    A##mu ^= Du; \
+    Bko = ROL64(A##mu, 8); \
+    A##sa ^= Da; \
+    Bku = ROL64(A##sa, 18); \
+    E##ka =   Bka ^(  Bke |  Bki ); \
+    E##ke =   Bke ^(  Bki &  Bko ); \
+    E##ki =   Bki ^((~Bko)&  Bku ); \
+    E##ko = (~Bko)^(  Bku |  Bka ); \
+    E##ku =   Bku ^(  Bka &  Bke ); \
+\
+    A##bu ^= Du; \
+    Bma = ROL64(A##bu, 27); \
+    A##ga ^= Da; \
+    Bme = ROL64(A##ga, 36); \
+    A##ke ^= De; \
+    Bmi = ROL64(A##ke, 10); \
+    A##mi ^= Di; \
+    Bmo = ROL64(A##mi, 15); \
+    A##so ^= Do; \
+    Bmu = ROL64(A##so, 56); \
+    E##ma =   Bma ^(  Bme &  Bmi ); \
+    E##me =   Bme ^(  Bmi |  Bmo ); \
+    E##mi =   Bmi ^((~Bmo)|  Bmu ); \
+    E##mo = (~Bmo)^(  Bmu &  Bma ); \
+    E##mu =   Bmu ^(  Bma |  Bme ); \
+\
+    A##bi ^= Di; \
+    Bsa = ROL64(A##bi, 62); \
+    A##go ^= Do; \
+    Bse = ROL64(A##go, 55); \
+    A##ku ^= Du; \
+    Bsi = ROL64(A##ku, 39); \
+    A##ma ^= Da; \
+    Bso = ROL64(A##ma, 41); \
+    A##se ^= De; \
+    Bsu = ROL64(A##se, 2); \
+    E##sa =   Bsa ^((~Bse)&  Bsi ); \
+    E##se = (~Bse)^(  Bsi |  Bso ); \
+    E##si =   Bsi ^(  Bso &  Bsu ); \
+    E##so =   Bso ^(  Bsu |  Bsa ); \
+    E##su =   Bsu ^(  Bsa &  Bse ); \
+\
+\
+
+#else /*  UseBebigokimisa */
+/*  --- Code for round, with prepare-theta */
+/*  --- 64-bit lanes mapped to 64-bit words */
+/*  One round reading lanes A##xx and writing lanes E##xx, XORing
+ *  KeccakF1600RoundConstants[i] into E##ba.  Expects the column parities of
+ *  the A lanes in Ca..Cu on entry and leaves the column parities of the E
+ *  lanes in Ca..Cu on exit.  The A lanes are modified in place (each is XORed
+ *  with its D value).  Every output lane has the uniform form
+ *  B ^ ((~Bnext) & Bnext2) within its group of five. */
+#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \
+    Da = Cu^ROL64(Ce, 1); \
+    De = Ca^ROL64(Ci, 1); \
+    Di = Ce^ROL64(Co, 1); \
+    Do = Ci^ROL64(Cu, 1); \
+    Du = Co^ROL64(Ca, 1); \
+\
+    A##ba ^= Da; \
+    Bba = A##ba; \
+    A##ge ^= De; \
+    Bbe = ROL64(A##ge, 44); \
+    A##ki ^= Di; \
+    Bbi = ROL64(A##ki, 43); \
+    A##mo ^= Do; \
+    Bbo = ROL64(A##mo, 21); \
+    A##su ^= Du; \
+    Bbu = ROL64(A##su, 14); \
+    E##ba =   Bba ^((~Bbe)&  Bbi ); \
+    E##ba ^= KeccakF1600RoundConstants[i]; \
+    Ca = E##ba; \
+    E##be =   Bbe ^((~Bbi)&  Bbo ); \
+    Ce = E##be; \
+    E##bi =   Bbi ^((~Bbo)&  Bbu ); \
+    Ci = E##bi; \
+    E##bo =   Bbo ^((~Bbu)&  Bba ); \
+    Co = E##bo; \
+    E##bu =   Bbu ^((~Bba)&  Bbe ); \
+    Cu = E##bu; \
+\
+    A##bo ^= Do; \
+    Bga = ROL64(A##bo, 28); \
+    A##gu ^= Du; \
+    Bge = ROL64(A##gu, 20); \
+    A##ka ^= Da; \
+    Bgi = ROL64(A##ka, 3); \
+    A##me ^= De; \
+    Bgo = ROL64(A##me, 45); \
+    A##si ^= Di; \
+    Bgu = ROL64(A##si, 61); \
+    E##ga =   Bga ^((~Bge)&  Bgi ); \
+    Ca ^= E##ga; \
+    E##ge =   Bge ^((~Bgi)&  Bgo ); \
+    Ce ^= E##ge; \
+    E##gi =   Bgi ^((~Bgo)&  Bgu ); \
+    Ci ^= E##gi; \
+    E##go =   Bgo ^((~Bgu)&  Bga ); \
+    Co ^= E##go; \
+    E##gu =   Bgu ^((~Bga)&  Bge ); \
+    Cu ^= E##gu; \
+\
+    A##be ^= De; \
+    Bka = ROL64(A##be, 1); \
+    A##gi ^= Di; \
+    Bke = ROL64(A##gi, 6); \
+    A##ko ^= Do; \
+    Bki = ROL64(A##ko, 25); \
+    A##mu ^= Du; \
+    Bko = ROL64(A##mu, 8); \
+    A##sa ^= Da; \
+    Bku = ROL64(A##sa, 18); \
+    E##ka =   Bka ^((~Bke)&  Bki ); \
+    Ca ^= E##ka; \
+    E##ke =   Bke ^((~Bki)&  Bko ); \
+    Ce ^= E##ke; \
+    E##ki =   Bki ^((~Bko)&  Bku ); \
+    Ci ^= E##ki; \
+    E##ko =   Bko ^((~Bku)&  Bka ); \
+    Co ^= E##ko; \
+    E##ku =   Bku ^((~Bka)&  Bke ); \
+    Cu ^= E##ku; \
+\
+    A##bu ^= Du; \
+    Bma = ROL64(A##bu, 27); \
+    A##ga ^= Da; \
+    Bme = ROL64(A##ga, 36); \
+    A##ke ^= De; \
+    Bmi = ROL64(A##ke, 10); \
+    A##mi ^= Di; \
+    Bmo = ROL64(A##mi, 15); \
+    A##so ^= Do; \
+    Bmu = ROL64(A##so, 56); \
+    E##ma =   Bma ^((~Bme)&  Bmi ); \
+    Ca ^= E##ma; \
+    E##me =   Bme ^((~Bmi)&  Bmo ); \
+    Ce ^= E##me; \
+    E##mi =   Bmi ^((~Bmo)&  Bmu ); \
+    Ci ^= E##mi; \
+    E##mo =   Bmo ^((~Bmu)&  Bma ); \
+    Co ^= E##mo; \
+    E##mu =   Bmu ^((~Bma)&  Bme ); \
+    Cu ^= E##mu; \
+\
+    A##bi ^= Di; \
+    Bsa = ROL64(A##bi, 62); \
+    A##go ^= Do; \
+    Bse = ROL64(A##go, 55); \
+    A##ku ^= Du; \
+    Bsi = ROL64(A##ku, 39); \
+    A##ma ^= Da; \
+    Bso = ROL64(A##ma, 41); \
+    A##se ^= De; \
+    Bsu = ROL64(A##se, 2); \
+    E##sa =   Bsa ^((~Bse)&  Bsi ); \
+    Ca ^= E##sa; \
+    E##se =   Bse ^((~Bsi)&  Bso ); \
+    Ce ^= E##se; \
+    E##si =   Bsi ^((~Bso)&  Bsu ); \
+    Ci ^= E##si; \
+    E##so =   Bso ^((~Bsu)&  Bsa ); \
+    Co ^= E##so; \
+    E##su =   Bsu ^((~Bsa)&  Bse ); \
+    Cu ^= E##su; \
+\
+
+/*  --- Code for round */
+/*  --- 64-bit lanes mapped to 64-bit words */
+/*  Same round computation as the plain thetaRhoPiChiIotaPrepareTheta, but
+ *  without accumulating the column parities of the E lanes: Ca..Cu are read
+ *  on entry and not written.  The A lanes are modified in place; the result
+ *  is in the E lanes. */
+#define thetaRhoPiChiIota(i, A, E) \
+    Da = Cu^ROL64(Ce, 1); \
+    De = Ca^ROL64(Ci, 1); \
+    Di = Ce^ROL64(Co, 1); \
+    Do = Ci^ROL64(Cu, 1); \
+    Du = Co^ROL64(Ca, 1); \
+\
+    A##ba ^= Da; \
+    Bba = A##ba; \
+    A##ge ^= De; \
+    Bbe = ROL64(A##ge, 44); \
+    A##ki ^= Di; \
+    Bbi = ROL64(A##ki, 43); \
+    A##mo ^= Do; \
+    Bbo = ROL64(A##mo, 21); \
+    A##su ^= Du; \
+    Bbu = ROL64(A##su, 14); \
+    E##ba =   Bba ^((~Bbe)&  Bbi ); \
+    E##ba ^= KeccakF1600RoundConstants[i]; \
+    E##be =   Bbe ^((~Bbi)&  Bbo ); \
+    E##bi =   Bbi ^((~Bbo)&  Bbu ); \
+    E##bo =   Bbo ^((~Bbu)&  Bba ); \
+    E##bu =   Bbu ^((~Bba)&  Bbe ); \
+\
+    A##bo ^= Do; \
+    Bga = ROL64(A##bo, 28); \
+    A##gu ^= Du; \
+    Bge = ROL64(A##gu, 20); \
+    A##ka ^= Da; \
+    Bgi = ROL64(A##ka, 3); \
+    A##me ^= De; \
+    Bgo = ROL64(A##me, 45); \
+    A##si ^= Di; \
+    Bgu = ROL64(A##si, 61); \
+    E##ga =   Bga ^((~Bge)&  Bgi ); \
+    E##ge =   Bge ^((~Bgi)&  Bgo ); \
+    E##gi =   Bgi ^((~Bgo)&  Bgu ); \
+    E##go =   Bgo ^((~Bgu)&  Bga ); \
+    E##gu =   Bgu ^((~Bga)&  Bge ); \
+\
+    A##be ^= De; \
+    Bka = ROL64(A##be, 1); \
+    A##gi ^= Di; \
+    Bke = ROL64(A##gi, 6); \
+    A##ko ^= Do; \
+    Bki = ROL64(A##ko, 25); \
+    A##mu ^= Du; \
+    Bko = ROL64(A##mu, 8); \
+    A##sa ^= Da; \
+    Bku = ROL64(A##sa, 18); \
+    E##ka =   Bka ^((~Bke)&  Bki ); \
+    E##ke =   Bke ^((~Bki)&  Bko ); \
+    E##ki =   Bki ^((~Bko)&  Bku ); \
+    E##ko =   Bko ^((~Bku)&  Bka ); \
+    E##ku =   Bku ^((~Bka)&  Bke ); \
+\
+    A##bu ^= Du; \
+    Bma = ROL64(A##bu, 27); \
+    A##ga ^= Da; \
+    Bme = ROL64(A##ga, 36); \
+    A##ke ^= De; \
+    Bmi = ROL64(A##ke, 10); \
+    A##mi ^= Di; \
+    Bmo = ROL64(A##mi, 15); \
+    A##so ^= Do; \
+    Bmu = ROL64(A##so, 56); \
+    E##ma =   Bma ^((~Bme)&  Bmi ); \
+    E##me =   Bme ^((~Bmi)&  Bmo ); \
+    E##mi =   Bmi ^((~Bmo)&  Bmu ); \
+    E##mo =   Bmo ^((~Bmu)&  Bma ); \
+    E##mu =   Bmu ^((~Bma)&  Bme ); \
+\
+    A##bi ^= Di; \
+    Bsa = ROL64(A##bi, 62); \
+    A##go ^= Do; \
+    Bse = ROL64(A##go, 55); \
+    A##ku ^= Du; \
+    Bsi = ROL64(A##ku, 39); \
+    A##ma ^= Da; \
+    Bso = ROL64(A##ma, 41); \
+    A##se ^= De; \
+    Bsu = ROL64(A##se, 2); \
+    E##sa =   Bsa ^((~Bse)&  Bsi ); \
+    E##se =   Bse ^((~Bsi)&  Bso ); \
+    E##si =   Bsi ^((~Bso)&  Bsu ); \
+    E##so =   Bso ^((~Bsu)&  Bsa ); \
+    E##su =   Bsu ^((~Bsa)&  Bse ); \
+\
+
+#endif /*  UseBebigokimisa */
+
+/* Per-round constants: entry i is XORed into lane ba by the round macros
+ * (KeccakF1600RoundConstants[i]).  24 entries, listed two per row in index
+ * order 0..23. */
+static const UINT64 KeccakF1600RoundConstants[24] = {
+    0x0000000000000001ULL, 0x0000000000008082ULL,
+    0x800000000000808aULL, 0x8000000080008000ULL,
+    0x000000000000808bULL, 0x0000000080000001ULL,
+    0x8000000080008081ULL, 0x8000000000008009ULL,
+    0x000000000000008aULL, 0x0000000000000088ULL,
+    0x0000000080008009ULL, 0x000000008000000aULL,
+    0x000000008000808bULL, 0x800000000000008bULL,
+    0x8000000000008089ULL, 0x8000000000008003ULL,
+    0x8000000000008002ULL, 0x8000000000000080ULL,
+    0x000000000000800aULL, 0x800000008000000aULL,
+    0x8000000080008081ULL, 0x8000000000008080ULL,
+    0x0000000080000001ULL, 0x8000000080008008ULL
+};
+
+/* Load the 25 lanes state[0..24] into X##ba..X##su, XORing the first 9 lanes
+ * (9 * 64 = 576 bits) with input[0..8]; the remaining lanes are copied
+ * unchanged. */
+#define copyFromStateAndXor576bits(X, state, input) \
+    X##ba = state[ 0]^input[ 0]; \
+    X##be = state[ 1]^input[ 1]; \
+    X##bi = state[ 2]^input[ 2]; \
+    X##bo = state[ 3]^input[ 3]; \
+    X##bu = state[ 4]^input[ 4]; \
+    X##ga = state[ 5]^input[ 5]; \
+    X##ge = state[ 6]^input[ 6]; \
+    X##gi = state[ 7]^input[ 7]; \
+    X##go = state[ 8]^input[ 8]; \
+    X##gu = state[ 9]; \
+    X##ka = state[10]; \
+    X##ke = state[11]; \
+    X##ki = state[12]; \
+    X##ko = state[13]; \
+    X##ku = state[14]; \
+    X##ma = state[15]; \
+    X##me = state[16]; \
+    X##mi = state[17]; \
+    X##mo = state[18]; \
+    X##mu = state[19]; \
+    X##sa = state[20]; \
+    X##se = state[21]; \
+    X##si = state[22]; \
+    X##so = state[23]; \
+    X##su = state[24]; \
+
+/* Load the 25 lanes state[0..24] into X##ba..X##su, XORing the first 13 lanes
+ * (13 * 64 = 832 bits) with input[0..12]; the remaining lanes are copied
+ * unchanged. */
+#define copyFromStateAndXor832bits(X, state, input) \
+    X##ba = state[ 0]^input[ 0]; \
+    X##be = state[ 1]^input[ 1]; \
+    X##bi = state[ 2]^input[ 2]; \
+    X##bo = state[ 3]^input[ 3]; \
+    X##bu = state[ 4]^input[ 4]; \
+    X##ga = state[ 5]^input[ 5]; \
+    X##ge = state[ 6]^input[ 6]; \
+    X##gi = state[ 7]^input[ 7]; \
+    X##go = state[ 8]^input[ 8]; \
+    X##gu = state[ 9]^input[ 9]; \
+    X##ka = state[10]^input[10]; \
+    X##ke = state[11]^input[11]; \
+    X##ki = state[12]^input[12]; \
+    X##ko = state[13]; \
+    X##ku = state[14]; \
+    X##ma = state[15]; \
+    X##me = state[16]; \
+    X##mi = state[17]; \
+    X##mo = state[18]; \
+    X##mu = state[19]; \
+    X##sa = state[20]; \
+    X##se = state[21]; \
+    X##si = state[22]; \
+    X##so = state[23]; \
+    X##su = state[24]; \
+
+/* Load the 25 lanes state[0..24] into X##ba..X##su, XORing the first 16 lanes
+ * (16 * 64 = 1024 bits) with input[0..15]; the remaining lanes are copied
+ * unchanged. */
+#define copyFromStateAndXor1024bits(X, state, input) \
+    X##ba = state[ 0]^input[ 0]; \
+    X##be = state[ 1]^input[ 1]; \
+    X##bi = state[ 2]^input[ 2]; \
+    X##bo = state[ 3]^input[ 3]; \
+    X##bu = state[ 4]^input[ 4]; \
+    X##ga = state[ 5]^input[ 5]; \
+    X##ge = state[ 6]^input[ 6]; \
+    X##gi = state[ 7]^input[ 7]; \
+    X##go = state[ 8]^input[ 8]; \
+    X##gu = state[ 9]^input[ 9]; \
+    X##ka = state[10]^input[10]; \
+    X##ke = state[11]^input[11]; \
+    X##ki = state[12]^input[12]; \
+    X##ko = state[13]^input[13]; \
+    X##ku = state[14]^input[14]; \
+    X##ma = state[15]^input[15]; \
+    X##me = state[16]; \
+    X##mi = state[17]; \
+    X##mo = state[18]; \
+    X##mu = state[19]; \
+    X##sa = state[20]; \
+    X##se = state[21]; \
+    X##si = state[22]; \
+    X##so = state[23]; \
+    X##su = state[24]; \
+
+/* Load the 25 lanes state[0..24] into X##ba..X##su, XORing the first 17 lanes
+ * (17 * 64 = 1088 bits) with input[0..16]; the remaining lanes are copied
+ * unchanged. */
+#define copyFromStateAndXor1088bits(X, state, input) \
+    X##ba = state[ 0]^input[ 0]; \
+    X##be = state[ 1]^input[ 1]; \
+    X##bi = state[ 2]^input[ 2]; \
+    X##bo = state[ 3]^input[ 3]; \
+    X##bu = state[ 4]^input[ 4]; \
+    X##ga = state[ 5]^input[ 5]; \
+    X##ge = state[ 6]^input[ 6]; \
+    X##gi = state[ 7]^input[ 7]; \
+    X##go = state[ 8]^input[ 8]; \
+    X##gu = state[ 9]^input[ 9]; \
+    X##ka = state[10]^input[10]; \
+    X##ke = state[11]^input[11]; \
+    X##ki = state[12]^input[12]; \
+    X##ko = state[13]^input[13]; \
+    X##ku = state[14]^input[14]; \
+    X##ma = state[15]^input[15]; \
+    X##me = state[16]^input[16]; \
+    X##mi = state[17]; \
+    X##mo = state[18]; \
+    X##mu = state[19]; \
+    X##sa = state[20]; \
+    X##se = state[21]; \
+    X##si = state[22]; \
+    X##so = state[23]; \
+    X##su = state[24]; \
+
+/* Load the 25 lanes state[0..24] into X##ba..X##su, XORing the first 18 lanes
+ * (18 * 64 = 1152 bits) with input[0..17]; the remaining lanes are copied
+ * unchanged. */
+#define copyFromStateAndXor1152bits(X, state, input) \
+    X##ba = state[ 0]^input[ 0]; \
+    X##be = state[ 1]^input[ 1]; \
+    X##bi = state[ 2]^input[ 2]; \
+    X##bo = state[ 3]^input[ 3]; \
+    X##bu = state[ 4]^input[ 4]; \
+    X##ga = state[ 5]^input[ 5]; \
+    X##ge = state[ 6]^input[ 6]; \
+    X##gi = state[ 7]^input[ 7]; \
+    X##go = state[ 8]^input[ 8]; \
+    X##gu = state[ 9]^input[ 9]; \
+    X##ka = state[10]^input[10]; \
+    X##ke = state[11]^input[11]; \
+    X##ki = state[12]^input[12]; \
+    X##ko = state[13]^input[13]; \
+    X##ku = state[14]^input[14]; \
+    X##ma = state[15]^input[15]; \
+    X##me = state[16]^input[16]; \
+    X##mi = state[17]^input[17]; \
+    X##mo = state[18]; \
+    X##mu = state[19]; \
+    X##sa = state[20]; \
+    X##se = state[21]; \
+    X##si = state[22]; \
+    X##so = state[23]; \
+    X##su = state[24]; \
+
+/* Load the 25 lanes state[0..24] into X##ba..X##su, XORing the first 21 lanes
+ * (21 * 64 = 1344 bits) with input[0..20]; the remaining lanes are copied
+ * unchanged. */
+#define copyFromStateAndXor1344bits(X, state, input) \
+    X##ba = state[ 0]^input[ 0]; \
+    X##be = state[ 1]^input[ 1]; \
+    X##bi = state[ 2]^input[ 2]; \
+    X##bo = state[ 3]^input[ 3]; \
+    X##bu = state[ 4]^input[ 4]; \
+    X##ga = state[ 5]^input[ 5]; \
+    X##ge = state[ 6]^input[ 6]; \
+    X##gi = state[ 7]^input[ 7]; \
+    X##go = state[ 8]^input[ 8]; \
+    X##gu = state[ 9]^input[ 9]; \
+    X##ka = state[10]^input[10]; \
+    X##ke = state[11]^input[11]; \
+    X##ki = state[12]^input[12]; \
+    X##ko = state[13]^input[13]; \
+    X##ku = state[14]^input[14]; \
+    X##ma = state[15]^input[15]; \
+    X##me = state[16]^input[16]; \
+    X##mi = state[17]^input[17]; \
+    X##mo = state[18]^input[18]; \
+    X##mu = state[19]^input[19]; \
+    X##sa = state[20]^input[20]; \
+    X##se = state[21]; \
+    X##si = state[22]; \
+    X##so = state[23]; \
+    X##su = state[24]; \
+
+/* Load the 25 lanes state[0..24] into the variables X##ba..X##su, in the
+ * order ba, be, bi, bo, bu, ga, ... su, without XORing any input. */
+#define copyFromState(X, state) \
+    X##ba = state[ 0]; \
+    X##be = state[ 1]; \
+    X##bi = state[ 2]; \
+    X##bo = state[ 3]; \
+    X##bu = state[ 4]; \
+    X##ga = state[ 5]; \
+    X##ge = state[ 6]; \
+    X##gi = state[ 7]; \
+    X##go = state[ 8]; \
+    X##gu = state[ 9]; \
+    X##ka = state[10]; \
+    X##ke = state[11]; \
+    X##ki = state[12]; \
+    X##ko = state[13]; \
+    X##ku = state[14]; \
+    X##ma = state[15]; \
+    X##me = state[16]; \
+    X##mi = state[17]; \
+    X##mo = state[18]; \
+    X##mu = state[19]; \
+    X##sa = state[20]; \
+    X##se = state[21]; \
+    X##si = state[22]; \
+    X##so = state[23]; \
+    X##su = state[24]; \
+
+/* Store the 25 lane variables X##ba..X##su into state[0..24]; the inverse of
+ * copyFromState (same lane-to-index mapping). */
+#define copyToState(state, X) \
+    state[ 0] = X##ba; \
+    state[ 1] = X##be; \
+    state[ 2] = X##bi; \
+    state[ 3] = X##bo; \
+    state[ 4] = X##bu; \
+    state[ 5] = X##ga; \
+    state[ 6] = X##ge; \
+    state[ 7] = X##gi; \
+    state[ 8] = X##go; \
+    state[ 9] = X##gu; \
+    state[10] = X##ka; \
+    state[11] = X##ke; \
+    state[12] = X##ki; \
+    state[13] = X##ko; \
+    state[14] = X##ku; \
+    state[15] = X##ma; \
+    state[16] = X##me; \
+    state[17] = X##mi; \
+    state[18] = X##mo; \
+    state[19] = X##mu; \
+    state[20] = X##sa; \
+    state[21] = X##se; \
+    state[22] = X##si; \
+    state[23] = X##so; \
+    state[24] = X##su; \
+
+/* Copy the 25 lane variables with prefix Y (Y##ba..Y##su) into the variables
+ * of the same name with prefix X. */
+#define copyStateVariables(X, Y) \
+    X##ba = Y##ba; \
+    X##be = Y##be; \
+    X##bi = Y##bi; \
+    X##bo = Y##bo; \
+    X##bu = Y##bu; \
+    X##ga = Y##ga; \
+    X##ge = Y##ge; \
+    X##gi = Y##gi; \
+    X##go = Y##go; \
+    X##gu = Y##gu; \
+    X##ka = Y##ka; \
+    X##ke = Y##ke; \
+    X##ki = Y##ki; \
+    X##ko = Y##ko; \
+    X##ku = Y##ku; \
+    X##ma = Y##ma; \
+    X##me = Y##me; \
+    X##mi = Y##mi; \
+    X##mo = Y##mo; \
+    X##mu = Y##mu; \
+    X##sa = Y##sa; \
+    X##se = Y##se; \
+    X##si = Y##si; \
+    X##so = Y##so; \
+    X##su = Y##su; \
+
diff --git a/Modules/_sha3/keccak/KeccakF-1600-int-set.h b/Modules/_sha3/keccak/KeccakF-1600-int-set.h
new file mode 100644
--- /dev/null
+++ b/Modules/_sha3/keccak/KeccakF-1600-int-set.h
@@ -0,0 +1,6 @@
+/* Feature switches for the fixed-rate fast paths.  Each ProvideFast<N> macro
+ * guards the matching KeccakAbsorb<N>bits declaration in
+ * KeccakF-1600-interface.h (and, for 1024, KeccakExtract1024bits). */
+#define ProvideFast576
+#define ProvideFast832
+#define ProvideFast1024
+#define ProvideFast1088
+#define ProvideFast1152
+#define ProvideFast1344
diff --git a/Modules/_sha3/keccak/KeccakF-1600-interface.h b/Modules/_sha3/keccak/KeccakF-1600-interface.h
new file mode 100644
--- /dev/null
+++ b/Modules/_sha3/keccak/KeccakF-1600-interface.h
@@ -0,0 +1,46 @@
+/*
+The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+Michaël Peeters and Gilles Van Assche. For more information, feedback or
+questions, please refer to our website: http://keccak.noekeon.org/
+
+Implementation by the designers,
+hereby denoted as "the implementer".
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+/* Declarations of the Keccak-f[1600] permutation interface.  Every function
+ * is declared static, so the implementation is presumably compiled into the
+ * same translation unit as its users -- TODO confirm against the including
+ * file.  The state and data buffers are passed as raw byte pointers. */
+#ifndef _KeccakPermutationInterface_h_
+#define _KeccakPermutationInterface_h_
+
+#include "KeccakF-1600-int-set.h"
+
+static void KeccakInitialize( void );
+static void KeccakInitializeState(unsigned char *state);
+static void KeccakPermutation(unsigned char *state);
+/* Fixed-size absorb variants; each is declared only when the matching
+ * ProvideFast<N> macro is defined. */
+#ifdef ProvideFast576
+static void KeccakAbsorb576bits(unsigned char *state, const unsigned char *data);
+#endif
+#ifdef ProvideFast832
+static void KeccakAbsorb832bits(unsigned char *state, const unsigned char *data);
+#endif
+#ifdef ProvideFast1024
+static void KeccakAbsorb1024bits(unsigned char *state, const unsigned char *data);
+#endif
+#ifdef ProvideFast1088
+static void KeccakAbsorb1088bits(unsigned char *state, const unsigned char *data);
+#endif
+#ifdef ProvideFast1152
+static void KeccakAbsorb1152bits(unsigned char *state, const unsigned char *data);
+#endif
+#ifdef ProvideFast1344
+static void KeccakAbsorb1344bits(unsigned char *state, const unsigned char *data);
+#endif
+/* Generic variants taking the number of lanes as a parameter. */
+static void KeccakAbsorb(unsigned char *state, const unsigned char *data, unsigned int laneCount);
+#ifdef ProvideFast1024
+static void KeccakExtract1024bits(const unsigned char *state, unsigned char *data);
+#endif
+static void KeccakExtract(const unsigned char *state, unsigned char *data, unsigned int laneCount);
+
+#endif
diff --git a/Modules/_sha3/keccak/KeccakF-1600-opt32-settings.h b/Modules/_sha3/keccak/KeccakF-1600-opt32-settings.h
new file mode 100644
--- /dev/null
+++ b/Modules/_sha3/keccak/KeccakF-1600-opt32-settings.h
@@ -0,0 +1,6 @@
+/*
+#define Unrolling 2
+#define UseBebigokimisa
+#define UseInterleaveTables
+#define UseSchedule 3
+*/
diff --git a/Modules/_sha3/keccak/KeccakF-1600-opt32.c b/Modules/_sha3/keccak/KeccakF-1600-opt32.c
new file mode 100644
--- /dev/null
+++ b/Modules/_sha3/keccak/KeccakF-1600-opt32.c
@@ -0,0 +1,524 @@
+/*
+The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+Michaël Peeters and Gilles Van Assche. For more information, feedback or
+questions, please refer to our website: http://keccak.noekeon.org/
+
+Implementation by the designers,
+hereby denoted as "the implementer".
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#include <string.h>
+/* #include "brg_endian.h" */
+#include "KeccakF-1600-opt32-settings.h"
+#include "KeccakF-1600-interface.h"
+
+/* Integer aliases used throughout this file. */
+typedef unsigned char UINT8;
+typedef unsigned short UINT16;
+typedef unsigned int UINT32;
+/* UINT64 is used further down but its typedef is disabled here; presumably
+ * the including file provides it -- TODO confirm. */
+/* typedef unsigned long long int UINT64; */
+
+#ifdef UseInterleaveTables
+/* Lookup tables for the 16-bit interleaving transform and its inverse; they
+ * are filled on demand by buildInterleaveTables(). */
+static int interleaveTablesBuilt = 0;
+static UINT16 interleaveTable[65536];
+static UINT16 deinterleaveTable[65536];
+
+/* Fill interleaveTable and deinterleaveTable on the first call; later calls
+ * return immediately.  For a 16-bit value i, interleaveTable[i] moves bit j
+ * of i to bit j/2 + 8*(j%2), so even-numbered bits land in the low byte and
+ * odd-numbered bits in the high byte; deinterleaveTable holds the inverse
+ * mapping.  The built flag is a plain static with no locking. */
+static void buildInterleaveTables(void)
+{
+    UINT32 i, j;
+    UINT16 x;
+
+    if (interleaveTablesBuilt)
+        return;
+
+    for(i=0; i<65536; i++) {
+        x = 0;
+        for(j=0; j<16; j++) {
+            if (i & (1 << j))
+                x |= (1 << (j/2 + 8*(j%2)));
+        }
+        interleaveTable[i] = x;
+        deinterleaveTable[x] = (UINT16)i;
+    }
+    interleaveTablesBuilt = 1;
+}
+
+/* NOTE(review): PLATFORM_BYTE_ORDER and IS_LITTLE_ENDIAN are not defined in
+ * this file (the brg_endian.h include is commented out).  If the including
+ * code defines neither, this test reads 0 == 0 and the little-endian branch
+ * is selected. */
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+
+/* Interleave the j-th 16-bit word of source through interleaveTable, then XOR
+ * its low byte into byte j of *even and its high byte into byte j of *odd.
+ * Requires a UINT16 local named i<j> in the calling scope. */
+#define xor2bytesIntoInterleavedWords(even, odd, source, j) \
+    i##j = interleaveTable[((const UINT16*)source)[j]]; \
+    ((UINT8*)even)[j] ^= i##j & 0xFF; \
+    ((UINT8*)odd)[j] ^= i##j >> 8;
+
+/* Combine byte j of even and byte j of odd, de-interleave the pair through
+ * deinterleaveTable and store the result as the j-th 16-bit word of dest.
+ * Requires a UINT16 local named d<j> in the calling scope. */
+#define setInterleavedWordsInto2bytes(dest, even, odd, j) \
+    d##j = deinterleaveTable[((even >> (j*8)) & 0xFF) ^ (((odd >> (j*8)) & 0xFF) << 8)]; \
+    ((UINT16*)dest)[j] = d##j;
+
+#else /*  (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN) */
+
+/* Byte-wise variant of the macro above.  The table value is widened to
+ * UINT32 before shifting: i<j> promotes to int, and shifting a value with
+ * bit 7 set left by 24 (j == 3) would overflow a signed int. */
+#define xor2bytesIntoInterleavedWords(even, odd, source, j) \
+    i##j = interleaveTable[source[2*j] ^ ((UINT16)source[2*j+1] << 8)]; \
+    *even ^= ((UINT32)(i##j & 0xFF)) << (j*8); \
+    *odd ^= ((UINT32)((i##j >> 8) & 0xFF)) << (j*8);
+
+/* Byte-wise variant: writes the de-interleaved 16-bit value to dest as low
+ * byte then high byte. */
+#define setInterleavedWordsInto2bytes(dest, even, odd, j) \
+    d##j = deinterleaveTable[((even >> (j*8)) & 0xFF) ^ (((odd >> (j*8)) & 0xFF) << 8)]; \
+    dest[2*j] = d##j & 0xFF; \
+    dest[2*j+1] = d##j >> 8;
+
+#endif /*  Endianness */
+
+/* XOR the 8 bytes at source, in interleaved form, into *even and *odd by
+ * processing them as four 16-bit pieces (j = 0..3). */
+static void xor8bytesIntoInterleavedWords(UINT32 *even, UINT32 *odd, const UINT8* source)
+{
+    /* Scratch variables named i<j>, as required by the macro's i##j pasting. */
+    UINT16 i0, i1, i2, i3;
+
+    xor2bytesIntoInterleavedWords(even, odd, source, 0)
+    xor2bytesIntoInterleavedWords(even, odd, source, 1)
+    xor2bytesIntoInterleavedWords(even, odd, source, 2)
+    xor2bytesIntoInterleavedWords(even, odd, source, 3)
+}
+
+/* XOR laneCount 8-byte lanes from input into state: lane i uses the word
+ * pair state[2*i], state[2*i+1] and the bytes input[8*i .. 8*i+7].  Expands
+ * to a braced block that declares its own loop counter i. */
+#define xorLanesIntoState(laneCount, state, input) \
+    { \
+        int i; \
+        for(i=0; i<(laneCount); i++) \
+            xor8bytesIntoInterleavedWords(state+i*2, state+i*2+1, input+i*8); \
+    }
+
+/* De-interleave the word pair (even, odd) and write the resulting 8 bytes to
+ * dest, two bytes at a time (j = 0..3). */
+static void setInterleavedWordsInto8bytes(UINT8* dest, UINT32 even, UINT32 odd)
+{
+    /* Scratch variables named d<j>, as required by the macro's d##j pasting. */
+    UINT16 d0, d1, d2, d3;
+
+    setInterleavedWordsInto2bytes(dest, even, odd, 0)
+    setInterleavedWordsInto2bytes(dest, even, odd, 1)
+    setInterleavedWordsInto2bytes(dest, even, odd, 2)
+    setInterleavedWordsInto2bytes(dest, even, odd, 3)
+}
+
+/* Write laneCount lanes from state to data, 8 bytes per lane: lane i is read
+ * as the UINT32 pair at indices 2*i and 2*i+1 of state and stored at
+ * data + 8*i.  Expands to a braced block with its own loop counter i. */
+#define extractLanes(laneCount, state, data) \
+    { \
+        int i; \
+        for(i=0; i<(laneCount); i++) \
+            setInterleavedWordsInto8bytes(data+i*8, ((UINT32*)state)[i*2], ((UINT32*)state)[i*2+1]); \
+    }
+
+#else /*  No interleaving tables */
+
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+
+/*  Credit: Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002 */
+/*  XOR rateInLanes 8-byte lanes from input into state in interleaved form.
+ *  Each lane is read as two 32-bit words x0 and x1; the four mask-and-shift
+ *  steps gather the even-numbered bits of a word into its low 16 bits and the
+ *  odd-numbered bits into its high 16 bits.  The even halves of x0 and x1 are
+ *  then XORed into one state word and the odd halves into the next.
+ *  The input words are loaded with memcpy rather than through a cast
+ *  UINT32 pointer, so input does not have to be 4-byte aligned.
+ *  Expands to a braced block with its own locals. */
+#define xorInterleavedLE(rateInLanes, state, input) \
+    { \
+        const UINT8 * pI = (const UINT8 *)(input); \
+        UINT32 * pS = (state); \
+        UINT32 t, x0, x1; \
+        int i; \
+        for (i = (rateInLanes)-1; i >= 0; --i) \
+        { \
+            memcpy(&x0, pI, sizeof(x0)); \
+            pI += sizeof(x0); \
+            t = (x0 ^ (x0 >>  1)) & 0x22222222UL;  x0 = x0 ^ t ^ (t <<  1); \
+            t = (x0 ^ (x0 >>  2)) & 0x0C0C0C0CUL;  x0 = x0 ^ t ^ (t <<  2); \
+            t = (x0 ^ (x0 >>  4)) & 0x00F000F0UL;  x0 = x0 ^ t ^ (t <<  4); \
+            t = (x0 ^ (x0 >>  8)) & 0x0000FF00UL;  x0 = x0 ^ t ^ (t <<  8); \
+            memcpy(&x1, pI, sizeof(x1)); \
+            pI += sizeof(x1); \
+            t = (x1 ^ (x1 >>  1)) & 0x22222222UL;  x1 = x1 ^ t ^ (t <<  1); \
+            t = (x1 ^ (x1 >>  2)) & 0x0C0C0C0CUL;  x1 = x1 ^ t ^ (t <<  2); \
+            t = (x1 ^ (x1 >>  4)) & 0x00F000F0UL;  x1 = x1 ^ t ^ (t <<  4); \
+            t = (x1 ^ (x1 >>  8)) & 0x0000FF00UL;  x1 = x1 ^ t ^ (t <<  8); \
+            *(pS++) ^= (UINT16)x0 | (x1 << 16); \
+            *(pS++) ^= (x0 >> 16) | (x1 & 0xFFFF0000); \
+        } \
+    }
+
+/* Little-endian, table-free build: XORing lanes into the state is done
+ * entirely by xorInterleavedLE. */
+#define xorLanesIntoState(laneCount, state, input) \
+    xorInterleavedLE(laneCount, state, input)
+
+#else /*  (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN) */
+
+/*  Credit: Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002 */
+/*  Return x with its bits rearranged by five mask-and-swap steps (distances
+ *  1, 2, 4, 8, 16): the even-numbered bits of x end up in the low 32 bits and
+ *  the odd-numbered bits in the high 32 bits.  fromInterleaving applies the
+ *  same steps in the opposite order.  File-local helper, hence static like
+ *  the other functions in this file. */
+static UINT64 toInterleaving(UINT64 x)
+{
+   UINT64 t;
+
+   t = (x ^ (x >>  1)) & 0x2222222222222222ULL;  x = x ^ t ^ (t <<  1);
+   t = (x ^ (x >>  2)) & 0x0C0C0C0C0C0C0C0CULL;  x = x ^ t ^ (t <<  2);
+   t = (x ^ (x >>  4)) & 0x00F000F000F000F0ULL;  x = x ^ t ^ (t <<  4);
+   t = (x ^ (x >>  8)) & 0x0000FF000000FF00ULL;  x = x ^ t ^ (t <<  8);
+   t = (x ^ (x >> 16)) & 0x00000000FFFF0000ULL;  x = x ^ t ^ (t << 16);
+
+   return x;
+}
+
+/* XOR one 8-byte lane from source, in interleaved form, into the word pair
+ * evenAndOdd[0] (low half of the interleaved value) and evenAndOdd[1] (high
+ * half). */
+static void xor8bytesIntoInterleavedWords(UINT32* evenAndOdd, const UINT8* source)
+{
+    UINT64 lane = 0;
+    UINT64 interleaved;
+    int k;
+
+    /*  Assemble the lane with source[0] as the least significant byte.
+        This can be optimized */
+    for (k = 7; k >= 0; k--)
+        lane = (lane << 8) | (UINT64)source[k];
+
+    interleaved = toInterleaving(lane);
+    evenAndOdd[0] ^= (UINT32)interleaved;
+    evenAndOdd[1] ^= (UINT32)(interleaved >> 32);
+}
+
+/* XOR laneCount 8-byte lanes from input into state: lane i uses the word
+ * pair starting at state + 2*i and the bytes starting at input + 8*i.
+ * Expands to a braced block that declares its own loop counter i. */
+#define xorLanesIntoState(laneCount, state, input) \
+    { \
+        int i; \
+        for(i=0; i<(laneCount); i++) \
+            xor8bytesIntoInterleavedWords(state+i*2, input+i*8); \
+    }
+
+#endif /*  Endianness */
+
+/*  Credit: Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002 */
+/*  Return x with its bits rearranged by five mask-and-swap steps applied at
+ *  distances 16, 8, 4, 2, 1 -- the reverse order of toInterleaving, whose
+ *  rearrangement this undoes.  File-local helper, hence static like the other
+ *  functions in this file. */
+static UINT64 fromInterleaving(UINT64 x)
+{
+   UINT64 t;
+
+   t = (x ^ (x >> 16)) & 0x00000000FFFF0000ULL;  x = x ^ t ^ (t << 16);
+   t = (x ^ (x >>  8)) & 0x0000FF000000FF00ULL;  x = x ^ t ^ (t <<  8);
+   t = (x ^ (x >>  4)) & 0x00F000F000F000F0ULL;  x = x ^ t ^ (t <<  4);
+   t = (x ^ (x >>  2)) & 0x0C0C0C0C0C0C0C0CULL;  x = x ^ t ^ (t <<  2);
+   t = (x ^ (x >>  1)) & 0x2222222222222222ULL;  x = x ^ t ^ (t <<  1);
+
+   return x;
+}
+
+/* De-interleave the word pair evenAndOdd[0], evenAndOdd[1] and write the
+ * resulting 64-bit lane to dest, least significant byte first.
+ * The lane is assembled from the two words explicitly and, on little-endian
+ * builds, stored with memcpy; neither dest nor evenAndOdd is accessed through
+ * a cast UINT64 pointer, so dest needs no particular alignment. */
+static void setInterleavedWordsInto8bytes(UINT8* dest, UINT32* evenAndOdd)
+{
+    UINT64 evenAndOddWord = (UINT64)evenAndOdd[0] ^ ((UINT64)evenAndOdd[1] << 32);
+    UINT64 destWord = fromInterleaving(evenAndOddWord);
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    /*  The in-memory layout of destWord is already the required byte order. */
+    memcpy(dest, &destWord, sizeof(destWord));
+#else /*  (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN) */
+    /*  This can be optimized */
+    dest[0] = destWord & 0xFF;
+    dest[1] = (destWord >> 8) & 0xFF;
+    dest[2] = (destWord >> 16) & 0xFF;
+    dest[3] = (destWord >> 24) & 0xFF;
+    dest[4] = (destWord >> 32) & 0xFF;
+    dest[5] = (destWord >> 40) & 0xFF;
+    dest[6] = (destWord >> 48) & 0xFF;
+    dest[7] = (destWord >> 56) & 0xFF;
+#endif /*  Endianness */
+}
+
+#define extractLanes(laneCount, state, data) \
+    { \
+        int i; \
+        for(i=0; i<(laneCount); i++) \
+            setInterleavedWordsInto8bytes(data+i*8, (UINT32*)state+i*2); \
+    }
+
+#endif /*  With or without interleaving tables */
+
+#if defined(_MSC_VER)
+#define ROL32(a, offset) _rotl(a, offset)
+#elif (defined (__arm__) && defined(__ARMCC_VERSION))
+#define ROL32(a, offset) __ror(a, 32-(offset))
+#else
+#define ROL32(a, offset) ((((UINT32)a) << (offset)) ^ (((UINT32)a) >> (32-(offset))))
+#endif
+
+#include "KeccakF-1600-unrolling.macros"
+#include "KeccakF-1600-32.macros"
+
+#if (UseSchedule == 3)
+
+#ifdef UseBebigokimisa
+#error "No lane complementing with schedule 3."
+#endif
+
+#if (Unrolling != 2)
+#error "Only unrolling 2 is supported by schedule 3."
+#endif
+
+static void KeccakPermutationOnWords(UINT32 *state)
+{
+    rounds
+}
+
+static void KeccakPermutationOnWordsAfterXoring(UINT32 *state, const UINT8 *input, unsigned int laneCount)
+{
+    xorLanesIntoState(laneCount, state, input)
+    rounds
+}
+
+#ifdef ProvideFast576
+static void KeccakPermutationOnWordsAfterXoring576bits(UINT32 *state, const UINT8 *input)
+{
+    xorLanesIntoState(9, state, input)
+    rounds
+}
+#endif
+
+#ifdef ProvideFast832
+static void KeccakPermutationOnWordsAfterXoring832bits(UINT32 *state, const UINT8 *input)
+{
+    xorLanesIntoState(13, state, input)
+    rounds
+}
+#endif
+
+#ifdef ProvideFast1024
+static void KeccakPermutationOnWordsAfterXoring1024bits(UINT32 *state, const UINT8 *input)
+{
+    xorLanesIntoState(16, state, input)
+    rounds
+}
+#endif
+
+#ifdef ProvideFast1088
+static void KeccakPermutationOnWordsAfterXoring1088bits(UINT32 *state, const UINT8 *input)
+{
+    xorLanesIntoState(17, state, input)
+    rounds
+}
+#endif
+
+#ifdef ProvideFast1152
+static void KeccakPermutationOnWordsAfterXoring1152bits(UINT32 *state, const UINT8 *input)
+{
+    xorLanesIntoState(18, state, input)
+    rounds
+}
+#endif
+
+#ifdef ProvideFast1344
+static void KeccakPermutationOnWordsAfterXoring1344bits(UINT32 *state, const UINT8 *input)
+{
+    xorLanesIntoState(21, state, input)
+    rounds
+}
+#endif
+
+#else /*  (UseSchedule != 3) */
+
+static void KeccakPermutationOnWords(UINT32 *state)
+{
+    declareABCDE
+#if (Unrolling != 24)
+    unsigned int i;
+#endif
+
+    copyFromState(A, state)
+    rounds
+}
+
+static void KeccakPermutationOnWordsAfterXoring(UINT32 *state, const UINT8 *input, unsigned int laneCount)
+{
+    declareABCDE
+    unsigned int i;
+
+    xorLanesIntoState(laneCount, state, input)
+    copyFromState(A, state)
+    rounds
+}
+
+#ifdef ProvideFast576
+static void KeccakPermutationOnWordsAfterXoring576bits(UINT32 *state, const UINT8 *input)
+{
+    declareABCDE
+    unsigned int i;
+
+    xorLanesIntoState(9, state, input)
+    copyFromState(A, state)
+    rounds
+}
+#endif
+
+#ifdef ProvideFast832
+static void KeccakPermutationOnWordsAfterXoring832bits(UINT32 *state, const UINT8 *input)
+{
+    declareABCDE
+    unsigned int i;
+
+    xorLanesIntoState(13, state, input)
+    copyFromState(A, state)
+    rounds
+}
+#endif
+
+#ifdef ProvideFast1024
+static void KeccakPermutationOnWordsAfterXoring1024bits(UINT32 *state, const UINT8 *input)
+{
+    declareABCDE
+    unsigned int i;
+
+    xorLanesIntoState(16, state, input)
+    copyFromState(A, state)
+    rounds
+}
+#endif
+
+#ifdef ProvideFast1088
+static void KeccakPermutationOnWordsAfterXoring1088bits(UINT32 *state, const UINT8 *input)
+{
+    declareABCDE
+    unsigned int i;
+
+    xorLanesIntoState(17, state, input)
+    copyFromState(A, state)
+    rounds
+}
+#endif
+
+#ifdef ProvideFast1152
+static void KeccakPermutationOnWordsAfterXoring1152bits(UINT32 *state, const UINT8 *input)
+{
+    declareABCDE
+    unsigned int i;
+
+    xorLanesIntoState(18, state, input)
+    copyFromState(A, state)
+    rounds
+}
+#endif
+
+#ifdef ProvideFast1344
+static void KeccakPermutationOnWordsAfterXoring1344bits(UINT32 *state, const UINT8 *input)
+{
+    declareABCDE
+    unsigned int i;
+
+    xorLanesIntoState(21, state, input)
+    copyFromState(A, state)
+    rounds
+}
+#endif
+
+#endif
+
+static void KeccakInitialize(void)   /*  one-time setup hook; explicitly takes no arguments */
+{
+#ifdef UseInterleaveTables
+    buildInterleaveTables();   /*  only called when the table-based interleaving is compiled in */
+#endif
+}
+
+static void KeccakInitializeState(unsigned char *state)
+{   /*  Resets the 200-byte buffer at state to its initial value. */
+    memset(state, 0, 200);
+#ifdef UseBebigokimisa
+    ((UINT32*)state)[ 2] = ~(UINT32)0;   /*  With lane complementing, lanes 1, 2, 8, 12, 17 and 20 start as all-ones. */
+    ((UINT32*)state)[ 3] = ~(UINT32)0;   /*  Each lane occupies two consecutive 32-bit words: 2*lane and 2*lane+1.   */
+    ((UINT32*)state)[ 4] = ~(UINT32)0;   /*  lane 2 */
+    ((UINT32*)state)[ 5] = ~(UINT32)0;
+    ((UINT32*)state)[16] = ~(UINT32)0;   /*  lane 8 */
+    ((UINT32*)state)[17] = ~(UINT32)0;
+    ((UINT32*)state)[24] = ~(UINT32)0;   /*  lane 12 */
+    ((UINT32*)state)[25] = ~(UINT32)0;
+    ((UINT32*)state)[34] = ~(UINT32)0;   /*  lane 17 */
+    ((UINT32*)state)[35] = ~(UINT32)0;
+    ((UINT32*)state)[40] = ~(UINT32)0;   /*  lane 20 */
+    ((UINT32*)state)[41] = ~(UINT32)0;
+#endif
+}
+
+static void KeccakPermutation(unsigned char *state)
+{
+    /*  We assume the state is always stored as interleaved 32-bit words */
+    KeccakPermutationOnWords((UINT32*)state);
+}
+
+#ifdef ProvideFast576
+static void KeccakAbsorb576bits(unsigned char *state, const unsigned char *data)
+{
+    KeccakPermutationOnWordsAfterXoring576bits((UINT32*)state, data);
+}
+#endif
+
+#ifdef ProvideFast832
+static void KeccakAbsorb832bits(unsigned char *state, const unsigned char *data)
+{
+    KeccakPermutationOnWordsAfterXoring832bits((UINT32*)state, data);
+}
+#endif
+
+#ifdef ProvideFast1024
+static void KeccakAbsorb1024bits(unsigned char *state, const unsigned char *data)
+{
+    KeccakPermutationOnWordsAfterXoring1024bits((UINT32*)state, data);
+}
+#endif
+
+#ifdef ProvideFast1088
+static void KeccakAbsorb1088bits(unsigned char *state, const unsigned char *data)
+{
+    KeccakPermutationOnWordsAfterXoring1088bits((UINT32*)state, data);
+}
+#endif
+
+#ifdef ProvideFast1152
+static void KeccakAbsorb1152bits(unsigned char *state, const unsigned char *data)
+{
+    KeccakPermutationOnWordsAfterXoring1152bits((UINT32*)state, data);
+}
+#endif
+
+#ifdef ProvideFast1344
+static void KeccakAbsorb1344bits(unsigned char *state, const unsigned char *data)
+{
+    KeccakPermutationOnWordsAfterXoring1344bits((UINT32*)state, data);
+}
+#endif
+
+static void KeccakAbsorb(unsigned char *state, const unsigned char *data, unsigned int laneCount)
+{
+    KeccakPermutationOnWordsAfterXoring((UINT32*)state, data, laneCount);
+}
+
+#ifdef ProvideFast1024
+static void KeccakExtract1024bits(const unsigned char *state, unsigned char *data)
+{
+    extractLanes(16, state, data)
+#ifdef UseBebigokimisa
+    ((UINT32*)data)[ 2] = ~((UINT32*)data)[ 2];
+    ((UINT32*)data)[ 3] = ~((UINT32*)data)[ 3];
+    ((UINT32*)data)[ 4] = ~((UINT32*)data)[ 4];
+    ((UINT32*)data)[ 5] = ~((UINT32*)data)[ 5];
+    ((UINT32*)data)[16] = ~((UINT32*)data)[16];
+    ((UINT32*)data)[17] = ~((UINT32*)data)[17];
+    ((UINT32*)data)[24] = ~((UINT32*)data)[24];
+    ((UINT32*)data)[25] = ~((UINT32*)data)[25];
+#endif
+}
+#endif
+
+static void KeccakExtract(const unsigned char *state, unsigned char *data, unsigned int laneCount)
+{
+    extractLanes(laneCount, state, data)
+#ifdef UseBebigokimisa
+    if (laneCount > 1) {
+        ((UINT32*)data)[ 2] = ~((UINT32*)data)[ 2];
+        ((UINT32*)data)[ 3] = ~((UINT32*)data)[ 3];
+        if (laneCount > 2) {
+            ((UINT32*)data)[ 4] = ~((UINT32*)data)[ 4];
+            ((UINT32*)data)[ 5] = ~((UINT32*)data)[ 5];
+            if (laneCount > 8) {
+                ((UINT32*)data)[16] = ~((UINT32*)data)[16];
+                ((UINT32*)data)[17] = ~((UINT32*)data)[17];
+                if (laneCount > 12) {
+                    ((UINT32*)data)[24] = ~((UINT32*)data)[24];
+                    ((UINT32*)data)[25] = ~((UINT32*)data)[25];
+                    if (laneCount > 17) {
+                        ((UINT32*)data)[34] = ~((UINT32*)data)[34];
+                        ((UINT32*)data)[35] = ~((UINT32*)data)[35];
+                        if (laneCount > 20) {
+                            ((UINT32*)data)[40] = ~((UINT32*)data)[40];
+                            ((UINT32*)data)[41] = ~((UINT32*)data)[41];
+                        }
+                    }
+                }
+            }
+        }
+    }
+#endif
+}
diff --git a/Modules/_sha3/keccak/KeccakF-1600-opt64-settings.h b/Modules/_sha3/keccak/KeccakF-1600-opt64-settings.h
new file mode 100644
--- /dev/null
+++ b/Modules/_sha3/keccak/KeccakF-1600-opt64-settings.h
@@ -0,0 +1,9 @@
+/*
+#define Unrolling 24
+#define UseBebigokimisa
+#define UseSSE
+#define UseOnlySIMD64
+#define UseMMX
+#define UseSHLD
+#define UseXOP
+*/
diff --git a/Modules/_sha3/keccak/KeccakF-1600-opt64.c b/Modules/_sha3/keccak/KeccakF-1600-opt64.c
new file mode 100644
--- /dev/null
+++ b/Modules/_sha3/keccak/KeccakF-1600-opt64.c
@@ -0,0 +1,508 @@
+/*
+The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+Michaël Peeters and Gilles Van Assche. For more information, feedback or
+questions, please refer to our website: http://keccak.noekeon.org/
+
+Implementation by the designers,
+hereby denoted as "the implementer".
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#include <string.h>
+/* #include "brg_endian.h" */
+#include "KeccakF-1600-opt64-settings.h"
+#include "KeccakF-1600-interface.h"
+
+typedef unsigned char UINT8;
+/* typedef unsigned long long int UINT64; */
+
+#if defined(__GNUC__)
+#define ALIGN __attribute__ ((aligned(32)))
+#elif defined(_MSC_VER)
+#define ALIGN __declspec(align(32))
+#else
+#define ALIGN
+#endif
+
+#if defined(UseSSE)
+    #include <x86intrin.h>
+    typedef __m128i V64;
+    typedef __m128i V128;
+    typedef union {
+        V128 v128;
+        UINT64 v64[2];
+    } V6464;
+
+    #define ANDnu64(a, b)       _mm_andnot_si128(a, b)
+    #define LOAD64(a)           _mm_loadl_epi64((const V64 *)&(a))
+    #define CONST64(a)          _mm_loadl_epi64((const V64 *)&(a))
+    #define ROL64(a, o)         _mm_or_si128(_mm_slli_epi64(a, o), _mm_srli_epi64(a, 64-(o)))
+    #define STORE64(a, b)       _mm_storel_epi64((V64 *)&(a), b)
+    #define XOR64(a, b)         _mm_xor_si128(a, b)
+    #define XOReq64(a, b)       a = _mm_xor_si128(a, b)
+    #define SHUFFLEBYTES128(a, b)   _mm_shuffle_epi8(a, b)
+
+    #define ANDnu128(a, b)      _mm_andnot_si128(a, b)
+    #define LOAD6464(a, b)      _mm_set_epi64((__m64)(a), (__m64)(b))
+    #define CONST128(a)         _mm_load_si128((const V128 *)&(a))
+    #define LOAD128(a)          _mm_load_si128((const V128 *)&(a))
+    #define LOAD128u(a)         _mm_loadu_si128((const V128 *)&(a))
+    #define ROL64in128(a, o)    _mm_or_si128(_mm_slli_epi64(a, o), _mm_srli_epi64(a, 64-(o)))
+    #define STORE128(a, b)      _mm_store_si128((V128 *)&(a), b)
+    #define XOR128(a, b)        _mm_xor_si128(a, b)
+    #define XOReq128(a, b)      a = _mm_xor_si128(a, b)
+    #define GET64LOLO(a, b)     _mm_unpacklo_epi64(a, b)
+    #define GET64HIHI(a, b)     _mm_unpackhi_epi64(a, b)
+    #define COPY64HI2LO(a)      _mm_shuffle_epi32(a, 0xEE)
+    #define COPY64LO2HI(a)      _mm_shuffle_epi32(a, 0x44)
+    #define ZERO128()           _mm_setzero_si128()
+
+    #ifdef UseOnlySIMD64
+    #include "KeccakF-1600-simd64.macros"
+    #else
+ALIGN const UINT64 rho8_56[2] = {0x0605040302010007, 0x080F0E0D0C0B0A09};
+    #include "KeccakF-1600-simd128.macros"
+    #endif
+
+    #ifdef UseBebigokimisa
+    #error "UseBebigokimisa cannot be used in combination with UseSSE"
+    #endif
+#elif defined(UseXOP)
+    #include <x86intrin.h>
+    typedef __m128i V64;
+    typedef __m128i V128;
+   
+    #define LOAD64(a)           _mm_loadl_epi64((const V64 *)&(a))
+    #define CONST64(a)          _mm_loadl_epi64((const V64 *)&(a))
+    #define STORE64(a, b)       _mm_storel_epi64((V64 *)&(a), b)
+    #define XOR64(a, b)         _mm_xor_si128(a, b)
+    #define XOReq64(a, b)       a = _mm_xor_si128(a, b)
+
+    #define ANDnu128(a, b)      _mm_andnot_si128(a, b)
+    #define LOAD6464(a, b)      _mm_set_epi64((__m64)(a), (__m64)(b))
+    #define CONST128(a)         _mm_load_si128((const V128 *)&(a))
+    #define LOAD128(a)          _mm_load_si128((const V128 *)&(a))
+    #define LOAD128u(a)         _mm_loadu_si128((const V128 *)&(a))
+    #define STORE128(a, b)      _mm_store_si128((V128 *)&(a), b)
+    #define XOR128(a, b)        _mm_xor_si128(a, b)
+    #define XOReq128(a, b)      a = _mm_xor_si128(a, b)
+    #define ZERO128()           _mm_setzero_si128()
+
+    #define SWAP64(a)           _mm_shuffle_epi32(a, 0x4E)
+    #define GET64LOLO(a, b)     _mm_unpacklo_epi64(a, b)
+    #define GET64HIHI(a, b)     _mm_unpackhi_epi64(a, b)
+    #define GET64LOHI(a, b)     ((__m128i)_mm_blend_pd((__m128d)a, (__m128d)b, 2))
+    #define GET64HILO(a, b)     SWAP64(GET64LOHI(b, a))
+    #define COPY64HI2LO(a)      _mm_shuffle_epi32(a, 0xEE)
+    #define COPY64LO2HI(a)      _mm_shuffle_epi32(a, 0x44)
+ 
+    #define ROL6464same(a, o)   _mm_roti_epi64(a, o)
+    #define ROL6464(a, r1, r2)  _mm_rot_epi64(a, CONST128( rot_##r1##_##r2 ))
+ALIGN const UINT64 rot_0_20[2]  = { 0, 20};
+ALIGN const UINT64 rot_44_3[2]  = {44,  3};
+ALIGN const UINT64 rot_43_45[2] = {43, 45};
+ALIGN const UINT64 rot_21_61[2] = {21, 61};
+ALIGN const UINT64 rot_14_28[2] = {14, 28};
+ALIGN const UINT64 rot_1_36[2]  = { 1, 36};
+ALIGN const UINT64 rot_6_10[2]  = { 6, 10};
+ALIGN const UINT64 rot_25_15[2] = {25, 15};
+ALIGN const UINT64 rot_8_56[2]  = { 8, 56};
+ALIGN const UINT64 rot_18_27[2] = {18, 27};
+ALIGN const UINT64 rot_62_55[2] = {62, 55};
+ALIGN const UINT64 rot_39_41[2] = {39, 41};
+
+#if defined(UseSimulatedXOP)
+    /*  For debugging purposes, when XOP is not available: emulate the XOP rotates with SSE2 shifts */
+    #undef ROL6464
+    #undef ROL6464same
+    #define ROL6464same(a, o)   _mm_or_si128(_mm_slli_epi64(a, o), _mm_srli_epi64(a, 64-(o)))
+    V128 ROL6464(V128 a, int r0, int r1)   /*  rotate the low 64-bit half left by r0 and the high half left by r1 */
+    {
+        V128 a0 = ROL6464same(a, r0);                /*  ROL64 is not defined in the UseXOP branch, so use the emulated same-count rotate */
+        V128 a1 = COPY64HI2LO(ROL6464same(a, r1));   /*  high half rotated by r1, copied into the low position */
+        return GET64LOLO(a0, a1);                    /*  result = { low(a0), low(a1) } */
+    }
+#endif
+    
+    #include "KeccakF-1600-xop.macros"
+
+    #ifdef UseBebigokimisa
+    #error "UseBebigokimisa cannot be used in combination with UseXOP"
+    #endif
+#elif defined(UseMMX)
+    #include <mmintrin.h>
+    typedef __m64 V64;
+    #define ANDnu64(a, b)       _mm_andnot_si64(a, b)
+
+    #if (defined(_MSC_VER) || defined (__INTEL_COMPILER))
+        #define LOAD64(a)       *(V64*)&(a)
+        #define CONST64(a)      *(V64*)&(a)
+        #define STORE64(a, b)   *(V64*)&(a) = b
+    #else
+        #define LOAD64(a)       (V64)a
+        #define CONST64(a)      (V64)a
+        #define STORE64(a, b)   a = (UINT64)b
+    #endif
+    #define ROL64(a, o)         _mm_or_si64(_mm_slli_si64(a, o), _mm_srli_si64(a, 64-(o)))
+    #define XOR64(a, b)         _mm_xor_si64(a, b)
+    #define XOReq64(a, b)       a = _mm_xor_si64(a, b)
+
+    #include "KeccakF-1600-simd64.macros"
+
+    #ifdef UseBebigokimisa
+    #error "UseBebigokimisa cannot be used in combination with UseMMX"
+    #endif
+#else
+    #if defined(_MSC_VER)
+    #define ROL64(a, offset) _rotl64(a, offset)
+    #elif defined(UseSHLD)
+      #define ROL64(x,N) ({ \
+        register UINT64 __out; \
+        register UINT64 __in = x; \
+        __asm__ ("shld %2,%0,%0" : "=r"(__out) : "0"(__in), "i"(N)); \
+        __out; \
+      })
+    #else
+    #define ROL64(a, offset) ((((UINT64)a) << offset) ^ (((UINT64)a) >> (64-offset)))
+    #endif
+
+    #include "KeccakF-1600-64.macros"
+#endif
+
+#include "KeccakF-1600-unrolling.macros"
+
+static void KeccakPermutationOnWords(UINT64 *state)
+{
+    declareABCDE
+#if (Unrolling != 24)
+    unsigned int i;
+#endif
+
+    copyFromState(A, state)
+    rounds
+#if defined(UseMMX)
+    _mm_empty();
+#endif
+}
+
+static void KeccakPermutationOnWordsAfterXoring(UINT64 *state, const UINT64 *input, unsigned int laneCount)
+{
+    declareABCDE
+#if (Unrolling != 24)
+    unsigned int i;   /*  not referenced directly here; presumably consumed by the `rounds` macro */
+#endif
+    unsigned int laneIndex;
+    /*  XOR the first laneCount input lanes into the state, then run the permutation on it. */
+    for(laneIndex=0; laneIndex!=laneCount; ++laneIndex)
+        state[laneIndex] ^= input[laneIndex];
+    copyFromState(A, state)
+    rounds
+#if defined(UseMMX)
+    _mm_empty();   /*  clear the MMX state once the MMX-based rounds are done */
+#endif
+}
+
+#ifdef ProvideFast576
+static void KeccakPermutationOnWordsAfterXoring576bits(UINT64 *state, const UINT64 *input)
+{
+    declareABCDE
+#if (Unrolling != 24)
+    unsigned int i;
+#endif
+
+    copyFromStateAndXor576bits(A, state, input)
+    rounds
+#if defined(UseMMX)
+    _mm_empty();
+#endif
+}
+#endif
+
+#ifdef ProvideFast832
+static void KeccakPermutationOnWordsAfterXoring832bits(UINT64 *state, const UINT64 *input)
+{
+    declareABCDE
+#if (Unrolling != 24)
+    unsigned int i;
+#endif
+
+    copyFromStateAndXor832bits(A, state, input)
+    rounds
+#if defined(UseMMX)
+    _mm_empty();
+#endif
+}
+#endif
+
+#ifdef ProvideFast1024
+static void KeccakPermutationOnWordsAfterXoring1024bits(UINT64 *state, const UINT64 *input)
+{
+    declareABCDE
+#if (Unrolling != 24)
+    unsigned int i;
+#endif
+
+    copyFromStateAndXor1024bits(A, state, input)
+    rounds
+#if defined(UseMMX)
+    _mm_empty();
+#endif
+}
+#endif
+
+#ifdef ProvideFast1088
+static void KeccakPermutationOnWordsAfterXoring1088bits(UINT64 *state, const UINT64 *input)
+{
+    declareABCDE
+#if (Unrolling != 24)
+    unsigned int i;
+#endif
+
+    copyFromStateAndXor1088bits(A, state, input)
+    rounds
+#if defined(UseMMX)
+    _mm_empty();
+#endif
+}
+#endif
+
+#ifdef ProvideFast1152
+static void KeccakPermutationOnWordsAfterXoring1152bits(UINT64 *state, const UINT64 *input)
+{
+    declareABCDE
+#if (Unrolling != 24)
+    unsigned int i;
+#endif
+
+    copyFromStateAndXor1152bits(A, state, input)
+    rounds
+#if defined(UseMMX)
+    _mm_empty();
+#endif
+}
+#endif
+
+#ifdef ProvideFast1344
+static void KeccakPermutationOnWordsAfterXoring1344bits(UINT64 *state, const UINT64 *input)
+{
+    declareABCDE
+#if (Unrolling != 24)
+    unsigned int i;
+#endif
+
+    copyFromStateAndXor1344bits(A, state, input)
+    rounds
+#if defined(UseMMX)
+    _mm_empty();
+#endif
+}
+#endif
+
+static void KeccakInitialize(void)   /*  one-time setup hook; explicitly takes no arguments */
+{   /*  Intentionally empty: this implementation has no tables or other global state to prepare. */
+}
+
+static void KeccakInitializeState(unsigned char *state)
+{   /*  Resets the 200-byte buffer at state to its initial value. */
+    memset(state, 0, 200);
+#ifdef UseBebigokimisa
+    ((UINT64*)state)[ 1] = ~(UINT64)0;   /*  With UseBebigokimisa, 64-bit lanes 1, 2, 8, 12, 17 and 20 start as all-ones; */
+    ((UINT64*)state)[ 2] = ~(UINT64)0;   /*  KeccakExtract() below inverts the same lanes again when copying data out.    */
+    ((UINT64*)state)[ 8] = ~(UINT64)0;
+    ((UINT64*)state)[12] = ~(UINT64)0;
+    ((UINT64*)state)[17] = ~(UINT64)0;
+    ((UINT64*)state)[20] = ~(UINT64)0;
+#endif
+}
+
+static void KeccakPermutation(unsigned char *state)
+{
+    /*  We assume the state is always stored as words */
+    KeccakPermutationOnWords((UINT64*)state);
+}
+
+/*
+static void fromBytesToWord(UINT64 *word, const UINT8 *bytes)
+{
+    unsigned int i;
+
+    *word = 0;
+    for(i=0; i<(64/8); i++)
+        *word |= (UINT64)(bytes[i]) << (8*i);
+}
+*/
+
+#ifdef ProvideFast576
+static void KeccakAbsorb576bits(unsigned char *state, const unsigned char *data)
+{
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    KeccakPermutationOnWordsAfterXoring576bits((UINT64*)state, (const UINT64*)data);
+#else
+    UINT64 dataAsWords[9];
+    unsigned int i;
+
+    for(i=0; i<9; i++)
+        fromBytesToWord(dataAsWords+i, data+(i*8));
+    KeccakPermutationOnWordsAfterXoring576bits((UINT64*)state, dataAsWords);
+#endif
+}
+#endif
+
+#ifdef ProvideFast832
+static void KeccakAbsorb832bits(unsigned char *state, const unsigned char *data)
+{
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    KeccakPermutationOnWordsAfterXoring832bits((UINT64*)state, (const UINT64*)data);
+#else
+    UINT64 dataAsWords[13];
+    unsigned int i;
+
+    for(i=0; i<13; i++)
+        fromBytesToWord(dataAsWords+i, data+(i*8));
+    KeccakPermutationOnWordsAfterXoring832bits((UINT64*)state, dataAsWords);
+#endif
+}
+#endif
+
+#ifdef ProvideFast1024
+static void KeccakAbsorb1024bits(unsigned char *state, const unsigned char *data)
+{
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    KeccakPermutationOnWordsAfterXoring1024bits((UINT64*)state, (const UINT64*)data);
+#else
+    UINT64 dataAsWords[16];
+    unsigned int i;
+
+    for(i=0; i<16; i++)
+        fromBytesToWord(dataAsWords+i, data+(i*8));
+    KeccakPermutationOnWordsAfterXoring1024bits((UINT64*)state, dataAsWords);
+#endif
+}
+#endif
+
+#ifdef ProvideFast1088
+static void KeccakAbsorb1088bits(unsigned char *state, const unsigned char *data)
+{
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    KeccakPermutationOnWordsAfterXoring1088bits((UINT64*)state, (const UINT64*)data);
+#else
+    UINT64 dataAsWords[17];
+    unsigned int i;
+
+    for(i=0; i<17; i++)
+        fromBytesToWord(dataAsWords+i, data+(i*8));
+    KeccakPermutationOnWordsAfterXoring1088bits((UINT64*)state, dataAsWords);
+#endif
+}
+#endif
+
+#ifdef ProvideFast1152
+static void KeccakAbsorb1152bits(unsigned char *state, const unsigned char *data)
+{
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    KeccakPermutationOnWordsAfterXoring1152bits((UINT64*)state, (const UINT64*)data);
+#else
+    UINT64 dataAsWords[18];
+    unsigned int i;
+
+    for(i=0; i<18; i++)
+        fromBytesToWord(dataAsWords+i, data+(i*8));
+    KeccakPermutationOnWordsAfterXoring1152bits((UINT64*)state, dataAsWords);
+#endif
+}
+#endif
+
+#ifdef ProvideFast1344
+static void KeccakAbsorb1344bits(unsigned char *state, const unsigned char *data)
+{
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    KeccakPermutationOnWordsAfterXoring1344bits((UINT64*)state, (const UINT64*)data);
+#else
+    UINT64 dataAsWords[21];
+    unsigned int i;
+
+    for(i=0; i<21; i++)
+        fromBytesToWord(dataAsWords+i, data+(i*8));
+    KeccakPermutationOnWordsAfterXoring1344bits((UINT64*)state, dataAsWords);
+#endif
+}
+#endif
+
+static void KeccakAbsorb(unsigned char *state, const unsigned char *data, unsigned int laneCount)
+{
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    KeccakPermutationOnWordsAfterXoring((UINT64*)state, (const UINT64*)data, laneCount);
+#else
+    UINT64 dataAsWords[25];
+    unsigned int i;
+
+    for(i=0; i<laneCount; i++)
+        fromBytesToWord(dataAsWords+i, data+(i*8));
+    KeccakPermutationOnWordsAfterXoring((UINT64*)state, dataAsWords, laneCount);
+#endif
+}
+
+/*
+static void fromWordToBytes(UINT8 *bytes, const UINT64 word)
+{
+    unsigned int i;
+
+    for(i=0; i<(64/8); i++)
+        bytes[i] = (word >> (8*i)) & 0xFF;
+}
+*/
+
+#ifdef ProvideFast1024
+static void KeccakExtract1024bits(const unsigned char *state, unsigned char *data)
+{
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    memcpy(data, state, 128);
+#else
+    unsigned int i;
+
+    for(i=0; i<16; i++)
+        fromWordToBytes(data+(i*8), ((const UINT64*)state)[i]);
+#endif
+#ifdef UseBebigokimisa
+    ((UINT64*)data)[ 1] = ~((UINT64*)data)[ 1];
+    ((UINT64*)data)[ 2] = ~((UINT64*)data)[ 2];
+    ((UINT64*)data)[ 8] = ~((UINT64*)data)[ 8];
+    ((UINT64*)data)[12] = ~((UINT64*)data)[12];
+#endif
+}
+#endif
+
+static void KeccakExtract(const unsigned char *state, unsigned char *data, unsigned int laneCount)
+{
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    memcpy(data, state, laneCount*8);
+#else
+    unsigned int i;
+
+    for(i=0; i<laneCount; i++)
+        fromWordToBytes(data+(i*8), ((const UINT64*)state)[i]);
+#endif
+#ifdef UseBebigokimisa
+    if (laneCount > 1) {
+        ((UINT64*)data)[ 1] = ~((UINT64*)data)[ 1];
+        if (laneCount > 2) {
+            ((UINT64*)data)[ 2] = ~((UINT64*)data)[ 2];
+            if (laneCount > 8) {
+                ((UINT64*)data)[ 8] = ~((UINT64*)data)[ 8];
+                if (laneCount > 12) {
+                    ((UINT64*)data)[12] = ~((UINT64*)data)[12];
+                    if (laneCount > 17) {
+                        ((UINT64*)data)[17] = ~((UINT64*)data)[17];
+                        if (laneCount > 20) {
+                            ((UINT64*)data)[20] = ~((UINT64*)data)[20];
+                        }
+                    }
+                }
+            }
+        }
+    }
+#endif
+}
diff --git a/Modules/_sha3/keccak/KeccakF-1600-simd128.macros b/Modules/_sha3/keccak/KeccakF-1600-simd128.macros
new file mode 100644
--- /dev/null
+++ b/Modules/_sha3/keccak/KeccakF-1600-simd128.macros
@@ -0,0 +1,651 @@
+/*
+The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+Michaël Peeters and Gilles Van Assche. For more information, feedback or
+questions, please refer to our website: http://keccak.noekeon.org/
+
+Implementation by the designers,
+hereby denoted as "the implementer".
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#define declareABCDE \
+    V6464 Abage, Abegi, Abigo, Abogu, Abuga; \
+    V6464 Akame, Akemi, Akimo, Akomu, Akuma; \
+    V6464 Abae, Abio, Agae, Agio, Akae, Akio, Amae, Amio, Asae, Asio; \
+    V64 Aba, Abe, Abi, Abo, Abu; \
+    V64 Aga, Age, Agi, Ago, Agu; \
+    V64 Aka, Ake, Aki, Ako, Aku; \
+    V64 Ama, Ame, Ami, Amo, Amu; \
+    V64 Asa, Ase, Asi, Aso, Asu; \
+    V128 Bbage, Bbegi, Bbigo, Bbogu, Bbuga; \
+    V128 Bkame, Bkemi, Bkimo, Bkomu, Bkuma; \
+    V64 Bba, Bbe, Bbi, Bbo, Bbu; \
+    V64 Bga, Bge, Bgi, Bgo, Bgu; \
+    V64 Bka, Bke, Bki, Bko, Bku; \
+    V64 Bma, Bme, Bmi, Bmo, Bmu; \
+    V64 Bsa, Bse, Bsi, Bso, Bsu; \
+    V128 Cae, Cei, Cio, Cou, Cua, Dei, Dou; \
+    V64 Ca, Ce, Ci, Co, Cu; \
+    V64 Da, De, Di, Do, Du; \
+    V6464 Ebage, Ebegi, Ebigo, Ebogu, Ebuga; \
+    V6464 Ekame, Ekemi, Ekimo, Ekomu, Ekuma; \
+    V64 Eba, Ebe, Ebi, Ebo, Ebu; \
+    V64 Ega, Ege, Egi, Ego, Egu; \
+    V64 Eka, Eke, Eki, Eko, Eku; \
+    V64 Ema, Eme, Emi, Emo, Emu; \
+    V64 Esa, Ese, Esi, Eso, Esu; \
+    V128 Zero;
+
+#define prepareTheta
+
+#define computeD \
+    Cua = GET64LOLO(Cu, Cae); \
+    Dei = XOR128(Cae, ROL64in128(Cio, 1)); \
+    Dou = XOR128(Cio, ROL64in128(Cua, 1)); \
+    Da = XOR64(Cu, ROL64in128(COPY64HI2LO(Cae), 1)); \
+    De = Dei; \
+    Di = COPY64HI2LO(Dei); \
+    Do = Dou; \
+    Du = COPY64HI2LO(Dou);
+
+/*  --- Theta Rho Pi Chi Iota Prepare-theta */
+/*  --- 64-bit lanes mapped to 64-bit and 128-bit words */
+#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \
+    computeD \
+    \
+    A##ba = LOAD64(A##bage.v64[0]); \
+    XOReq64(A##ba, Da); \
+    Bba = A##ba; \
+    XOReq64(A##gu, Du); \
+    Bge = ROL64(A##gu, 20); \
+    Bbage = GET64LOLO(Bba, Bge); \
+    A##ge = LOAD64(A##bage.v64[1]); \
+    XOReq64(A##ge, De); \
+    Bbe = ROL64(A##ge, 44); \
+    A##ka = LOAD64(A##kame.v64[0]); \
+    XOReq64(A##ka, Da); \
+    Bgi = ROL64(A##ka, 3); \
+    Bbegi = GET64LOLO(Bbe, Bgi); \
+    XOReq64(A##ki, Di); \
+    Bbi = ROL64(A##ki, 43); \
+    A##me = LOAD64(A##kame.v64[1]); \
+    XOReq64(A##me, De); \
+    Bgo = ROL64(A##me, 45); \
+    Bbigo = GET64LOLO(Bbi, Bgo); \
+    E##bage.v128 = XOR128(Bbage, ANDnu128(Bbegi, Bbigo)); \
+    XOReq128(E##bage.v128, CONST64(KeccakF1600RoundConstants[i])); \
+    Cae = E##bage.v128; \
+    XOReq64(A##mo, Do); \
+    Bbo = ROL64(A##mo, 21); \
+    XOReq64(A##si, Di); \
+    Bgu = ROL64(A##si, 61); \
+    Bbogu = GET64LOLO(Bbo, Bgu); \
+    E##begi.v128 = XOR128(Bbegi, ANDnu128(Bbigo, Bbogu)); \
+    Cei = E##begi.v128; \
+    XOReq64(A##su, Du); \
+    Bbu = ROL64(A##su, 14); \
+    XOReq64(A##bo, Do); \
+    Bga = ROL64(A##bo, 28); \
+    Bbuga = GET64LOLO(Bbu, Bga); \
+    E##bigo.v128 = XOR128(Bbigo, ANDnu128(Bbogu, Bbuga)); \
+    E##bi = E##bigo.v128; \
+    E##go = GET64HIHI(E##bigo.v128, E##bigo.v128); \
+    Cio = E##bigo.v128; \
+    E##bogu.v128 = XOR128(Bbogu, ANDnu128(Bbuga, Bbage)); \
+    E##bo = E##bogu.v128; \
+    E##gu = GET64HIHI(E##bogu.v128, E##bogu.v128); \
+    Cou = E##bogu.v128; \
+    E##buga.v128 = XOR128(Bbuga, ANDnu128(Bbage, Bbegi)); \
+    E##bu = E##buga.v128; \
+    E##ga = GET64HIHI(E##buga.v128, E##buga.v128); \
+    Cua = E##buga.v128; \
+\
+    A##be = LOAD64(A##begi.v64[0]); \
+    XOReq64(A##be, De); \
+    Bka = ROL64(A##be, 1); \
+    XOReq64(A##ga, Da); \
+    Bme = ROL64(A##ga, 36); \
+    Bkame = GET64LOLO(Bka, Bme); \
+    A##gi = LOAD64(A##begi.v64[1]); \
+    XOReq64(A##gi, Di); \
+    Bke = ROL64(A##gi, 6); \
+    A##ke = LOAD64(A##kemi.v64[0]); \
+    XOReq64(A##ke, De); \
+    Bmi = ROL64(A##ke, 10); \
+    Bkemi = GET64LOLO(Bke, Bmi); \
+    XOReq64(A##ko, Do); \
+    Bki = ROL64(A##ko, 25); \
+    A##mi = LOAD64(A##kemi.v64[1]); \
+    XOReq64(A##mi, Di); \
+    Bmo = ROL64(A##mi, 15); \
+    Bkimo = GET64LOLO(Bki, Bmo); \
+    E##kame.v128 = XOR128(Bkame, ANDnu128(Bkemi, Bkimo)); \
+    XOReq128(Cae, E##kame.v128); \
+    Bkomu = GET64LOLO(XOR64(A##mu, Du), XOR64(A##so, Do)); \
+    Bkomu = SHUFFLEBYTES128(Bkomu, CONST128(rho8_56)); \
+    E##kemi.v128 = XOR128(Bkemi, ANDnu128(Bkimo, Bkomu)); \
+    XOReq128(Cei, E##kemi.v128); \
+    XOReq64(A##sa, Da); \
+    Bku = ROL64(A##sa, 18); \
+    XOReq64(A##bu, Du); \
+    Bma = ROL64(A##bu, 27); \
+    Bkuma = GET64LOLO(Bku, Bma); \
+    E##kimo.v128 = XOR128(Bkimo, ANDnu128(Bkomu, Bkuma)); \
+    E##ki = E##kimo.v128; \
+    E##mo = GET64HIHI(E##kimo.v128, E##kimo.v128); \
+    XOReq128(Cio, E##kimo.v128); \
+    E##komu.v128 = XOR128(Bkomu, ANDnu128(Bkuma, Bkame)); \
+    E##ko = E##komu.v128; \
+    E##mu = GET64HIHI(E##komu.v128, E##komu.v128); \
+    XOReq128(Cou, E##komu.v128); \
+    E##kuma.v128 = XOR128(Bkuma, ANDnu128(Bkame, Bkemi)); \
+    E##ku = E##kuma.v128; \
+    E##ma = GET64HIHI(E##kuma.v128, E##kuma.v128); \
+    XOReq128(Cua, E##kuma.v128); \
+\
+    XOReq64(A##bi, Di); \
+    Bsa = ROL64(A##bi, 62); \
+    XOReq64(A##go, Do); \
+    Bse = ROL64(A##go, 55); \
+    XOReq64(A##ku, Du); \
+    Bsi = ROL64(A##ku, 39); \
+    E##sa = XOR64(Bsa, ANDnu64(Bse, Bsi)); \
+    Ca = E##sa; \
+    XOReq64(A##ma, Da); \
+    Bso = ROL64(A##ma, 41); \
+    E##se = XOR64(Bse, ANDnu64(Bsi, Bso)); \
+    Ce = E##se; \
+    XOReq128(Cae, GET64LOLO(Ca, Ce)); \
+    XOReq64(A##se, De); \
+    Bsu = ROL64(A##se, 2); \
+    E##si = XOR64(Bsi, ANDnu64(Bso, Bsu)); \
+    Ci = E##si; \
+    E##so = XOR64(Bso, ANDnu64(Bsu, Bsa)); \
+    Co = E##so; \
+    XOReq128(Cio, GET64LOLO(Ci, Co)); \
+    E##su = XOR64(Bsu, ANDnu64(Bsa, Bse)); \
+    Cu = E##su; \
+\
+    Zero = ZERO128(); \
+    XOReq128(Cae, GET64HIHI(Cua, Zero)); \
+    XOReq128(Cae, GET64LOLO(Zero, Cei)); \
+    XOReq128(Cio, GET64HIHI(Cei, Zero)); \
+    XOReq128(Cio, GET64LOLO(Zero, Cou)); \
+    XOReq128(Cua, GET64HIHI(Cou, Zero)); \
+    XOReq64(Cu, Cua); \
+
+/*  --- Theta Rho Pi Chi Iota */
+/*  --- 64-bit lanes mapped to 64-bit and 128-bit words */
+#define thetaRhoPiChiIota(i, A, E) thetaRhoPiChiIotaPrepareTheta(i, A, E)
+
+static const UINT64 KeccakF1600RoundConstants[24] = {
+    0x0000000000000001ULL,
+    0x0000000000008082ULL,
+    0x800000000000808aULL,
+    0x8000000080008000ULL,
+    0x000000000000808bULL,
+    0x0000000080000001ULL,
+    0x8000000080008081ULL,
+    0x8000000000008009ULL,
+    0x000000000000008aULL,
+    0x0000000000000088ULL,
+    0x0000000080008009ULL,
+    0x000000008000000aULL,
+    0x000000008000808bULL,
+    0x800000000000008bULL,
+    0x8000000000008089ULL,
+    0x8000000000008003ULL,
+    0x8000000000008002ULL,
+    0x8000000000000080ULL,
+    0x000000000000800aULL,
+    0x800000008000000aULL,
+    0x8000000080008081ULL,
+    0x8000000000008080ULL,
+    0x0000000080000001ULL,
+    0x8000000080008008ULL };
+
+#define copyFromStateAndXor576bits(X, state, input) /* load state[0..24] into the X##.. variables, XORing in input (read at [0], [2], [4], [5], [7]); also builds the Cae/Cio/Cu accumulators */ \
+    X##bae.v128 = XOR128(LOAD128(state[ 0]), LOAD128u(input[ 0])); \
+    X##ba = X##bae.v128; \
+    X##be = GET64HIHI(X##bae.v128, X##bae.v128); \
+    Cae = X##bae.v128; /* Cae starts from the b-row a|e pair; the g, k, m and s pairs are folded in below */ \
+    X##bio.v128 = XOR128(LOAD128(state[ 2]), LOAD128u(input[ 2])); \
+    X##bi = X##bio.v128; \
+    X##bo = GET64HIHI(X##bio.v128, X##bio.v128); \
+    Cio = X##bio.v128; /* Cio starts from the b-row i|o pair */ \
+    X##bu = XOR64(LOAD64(state[ 4]), LOAD64(input[ 4])); \
+    Cu = X##bu; /* Cu starts from bu; gu, ku, mu and su are folded in below */ \
+    X##gae.v128 = XOR128(LOAD128u(state[ 5]), LOAD128u(input[ 5])); \
+    X##ga = X##gae.v128; \
+    X##ge = GET64HIHI(X##gae.v128, X##gae.v128); \
+    X##bage.v128 = GET64LOLO(X##ba, X##ge); /* packed ba|ge pair; copyToState writes its elements to state[0] and state[6] */ \
+    XOReq128(Cae, X##gae.v128); \
+    X##gio.v128 = XOR128(LOAD128u(state[ 7]), LOAD128u(input[ 7])); /* last read from input in this macro */ \
+    X##gi = X##gio.v128; \
+    X##begi.v128 = GET64LOLO(X##be, X##gi); /* packed be|gi pair; copyToState writes its elements to state[1] and state[7] */ \
+    X##go = GET64HIHI(X##gio.v128, X##gio.v128); \
+    XOReq128(Cio, X##gio.v128); \
+    X##gu = LOAD64(state[ 9]); /* from here on lanes come from state alone, no input XOR */ \
+    XOReq64(Cu, X##gu); \
+    X##kae.v128 = LOAD128(state[10]); \
+    X##ka = X##kae.v128; \
+    X##ke = GET64HIHI(X##kae.v128, X##kae.v128); \
+    XOReq128(Cae, X##kae.v128); \
+    X##kio.v128 = LOAD128(state[12]); \
+    X##ki = X##kio.v128; \
+    X##ko = GET64HIHI(X##kio.v128, X##kio.v128); \
+    XOReq128(Cio, X##kio.v128); \
+    X##ku = LOAD64(state[14]); \
+    XOReq64(Cu, X##ku); \
+    X##mae.v128 = LOAD128u(state[15]); \
+    X##ma = X##mae.v128; \
+    X##me = GET64HIHI(X##mae.v128, X##mae.v128); \
+    X##kame.v128 = GET64LOLO(X##ka, X##me); /* packed ka|me pair; copyToState writes its elements to state[10] and state[16] */ \
+    XOReq128(Cae, X##mae.v128); \
+    X##mio.v128 = LOAD128u(state[17]); \
+    X##mi = X##mio.v128; \
+    X##kemi.v128 = GET64LOLO(X##ke, X##mi); /* packed ke|mi pair; copyToState writes its elements to state[11] and state[17] */ \
+    X##mo = GET64HIHI(X##mio.v128, X##mio.v128); \
+    XOReq128(Cio, X##mio.v128); \
+    X##mu = LOAD64(state[19]); \
+    XOReq64(Cu, X##mu); \
+    X##sae.v128 = LOAD128(state[20]); \
+    X##sa = X##sae.v128; \
+    X##se = GET64HIHI(X##sae.v128, X##sae.v128); \
+    XOReq128(Cae, X##sae.v128); \
+    X##sio.v128 = LOAD128(state[22]); \
+    X##si = X##sio.v128; \
+    X##so = GET64HIHI(X##sio.v128, X##sio.v128); \
+    XOReq128(Cio, X##sio.v128); \
+    X##su = LOAD64(state[24]); \
+    XOReq64(Cu, X##su); \
+
+#define copyFromStateAndXor832bits(X, state, input) /* load state[0..24] into the X##.. variables, XORing in input (read at [0], [2], [4], [5], [7], [9], [10], [12]); also builds the Cae/Cio/Cu accumulators */ \
+    X##bae.v128 = XOR128(LOAD128(state[ 0]), LOAD128u(input[ 0])); \
+    X##ba = X##bae.v128; \
+    X##be = GET64HIHI(X##bae.v128, X##bae.v128); \
+    Cae = X##bae.v128; /* Cae starts from the b-row a|e pair; the g, k, m and s pairs are folded in below */ \
+    X##bio.v128 = XOR128(LOAD128(state[ 2]), LOAD128u(input[ 2])); \
+    X##bi = X##bio.v128; \
+    X##bo = GET64HIHI(X##bio.v128, X##bio.v128); \
+    Cio = X##bio.v128; /* Cio starts from the b-row i|o pair */ \
+    X##bu = XOR64(LOAD64(state[ 4]), LOAD64(input[ 4])); \
+    Cu = X##bu; /* Cu starts from bu; gu, ku, mu and su are folded in below */ \
+    X##gae.v128 = XOR128(LOAD128u(state[ 5]), LOAD128u(input[ 5])); \
+    X##ga = X##gae.v128; \
+    X##ge = GET64HIHI(X##gae.v128, X##gae.v128); \
+    X##bage.v128 = GET64LOLO(X##ba, X##ge); /* packed ba|ge pair; copyToState writes its elements to state[0] and state[6] */ \
+    XOReq128(Cae, X##gae.v128); \
+    X##gio.v128 = XOR128(LOAD128u(state[ 7]), LOAD128u(input[ 7])); \
+    X##gi = X##gio.v128; \
+    X##begi.v128 = GET64LOLO(X##be, X##gi); /* packed be|gi pair; copyToState writes its elements to state[1] and state[7] */ \
+    X##go = GET64HIHI(X##gio.v128, X##gio.v128); \
+    XOReq128(Cio, X##gio.v128); \
+    X##gu = XOR64(LOAD64(state[ 9]), LOAD64(input[ 9])); \
+    XOReq64(Cu, X##gu); \
+    X##kae.v128 = XOR128(LOAD128(state[10]), LOAD128u(input[10])); \
+    X##ka = X##kae.v128; \
+    X##ke = GET64HIHI(X##kae.v128, X##kae.v128); \
+    XOReq128(Cae, X##kae.v128); \
+    X##kio.v128 = XOR128(LOAD128(state[12]), LOAD64(input[12])); /* last read from input. NOTE(review): 64-bit load inside a 128-bit XOR, so presumably only ki receives input and ko stays state[13]; confirm LOAD64 zero-fills the upper half */ \
+    X##ki = X##kio.v128; \
+    X##ko = GET64HIHI(X##kio.v128, X##kio.v128); \
+    XOReq128(Cio, X##kio.v128); \
+    X##ku = LOAD64(state[14]); /* from here on lanes come from state alone, no input XOR */ \
+    XOReq64(Cu, X##ku); \
+    X##mae.v128 = LOAD128u(state[15]); \
+    X##ma = X##mae.v128; \
+    X##me = GET64HIHI(X##mae.v128, X##mae.v128); \
+    X##kame.v128 = GET64LOLO(X##ka, X##me); /* packed ka|me pair; copyToState writes its elements to state[10] and state[16] */ \
+    XOReq128(Cae, X##mae.v128); \
+    X##mio.v128 = LOAD128u(state[17]); \
+    X##mi = X##mio.v128; \
+    X##kemi.v128 = GET64LOLO(X##ke, X##mi); /* packed ke|mi pair; copyToState writes its elements to state[11] and state[17] */ \
+    X##mo = GET64HIHI(X##mio.v128, X##mio.v128); \
+    XOReq128(Cio, X##mio.v128); \
+    X##mu = LOAD64(state[19]); \
+    XOReq64(Cu, X##mu); \
+    X##sae.v128 = LOAD128(state[20]); \
+    X##sa = X##sae.v128; \
+    X##se = GET64HIHI(X##sae.v128, X##sae.v128); \
+    XOReq128(Cae, X##sae.v128); \
+    X##sio.v128 = LOAD128(state[22]); \
+    X##si = X##sio.v128; \
+    X##so = GET64HIHI(X##sio.v128, X##sio.v128); \
+    XOReq128(Cio, X##sio.v128); \
+    X##su = LOAD64(state[24]); \
+    XOReq64(Cu, X##su); \
+
+#define copyFromStateAndXor1024bits(X, state, input) /* load state[0..24] into the X##.. variables, XORing in input (read at [0], [2], [4], [5], [7], [9], [10], [12], [14], [15]); also builds the Cae/Cio/Cu accumulators */ \
+    X##bae.v128 = XOR128(LOAD128(state[ 0]), LOAD128u(input[ 0])); \
+    X##ba = X##bae.v128; \
+    X##be = GET64HIHI(X##bae.v128, X##bae.v128); \
+    Cae = X##bae.v128; /* Cae starts from the b-row a|e pair; the g, k, m and s pairs are folded in below */ \
+    X##bio.v128 = XOR128(LOAD128(state[ 2]), LOAD128u(input[ 2])); \
+    X##bi = X##bio.v128; \
+    X##bo = GET64HIHI(X##bio.v128, X##bio.v128); \
+    Cio = X##bio.v128; /* Cio starts from the b-row i|o pair */ \
+    X##bu = XOR64(LOAD64(state[ 4]), LOAD64(input[ 4])); \
+    Cu = X##bu; /* Cu starts from bu; gu, ku, mu and su are folded in below */ \
+    X##gae.v128 = XOR128(LOAD128u(state[ 5]), LOAD128u(input[ 5])); \
+    X##ga = X##gae.v128; \
+    X##ge = GET64HIHI(X##gae.v128, X##gae.v128); \
+    X##bage.v128 = GET64LOLO(X##ba, X##ge); /* packed ba|ge pair; copyToState writes its elements to state[0] and state[6] */ \
+    XOReq128(Cae, X##gae.v128); \
+    X##gio.v128 = XOR128(LOAD128u(state[ 7]), LOAD128u(input[ 7])); \
+    X##gi = X##gio.v128; \
+    X##begi.v128 = GET64LOLO(X##be, X##gi); /* packed be|gi pair; copyToState writes its elements to state[1] and state[7] */ \
+    X##go = GET64HIHI(X##gio.v128, X##gio.v128); \
+    XOReq128(Cio, X##gio.v128); \
+    X##gu = XOR64(LOAD64(state[ 9]), LOAD64(input[ 9])); \
+    XOReq64(Cu, X##gu); \
+    X##kae.v128 = XOR128(LOAD128(state[10]), LOAD128u(input[10])); \
+    X##ka = X##kae.v128; \
+    X##ke = GET64HIHI(X##kae.v128, X##kae.v128); \
+    XOReq128(Cae, X##kae.v128); \
+    X##kio.v128 = XOR128(LOAD128(state[12]), LOAD128u(input[12])); \
+    X##ki = X##kio.v128; \
+    X##ko = GET64HIHI(X##kio.v128, X##kio.v128); \
+    XOReq128(Cio, X##kio.v128); \
+    X##ku = XOR64(LOAD64(state[14]), LOAD64(input[14])); \
+    XOReq64(Cu, X##ku); \
+    X##mae.v128 = XOR128(LOAD128u(state[15]), LOAD64(input[15])); /* last read from input. NOTE(review): 64-bit load inside a 128-bit XOR, so presumably only ma receives input and me stays state[16]; confirm LOAD64 zero-fills the upper half */ \
+    X##ma = X##mae.v128; \
+    X##me = GET64HIHI(X##mae.v128, X##mae.v128); \
+    X##kame.v128 = GET64LOLO(X##ka, X##me); /* packed ka|me pair; copyToState writes its elements to state[10] and state[16] */ \
+    XOReq128(Cae, X##mae.v128); \
+    X##mio.v128 = LOAD128u(state[17]); /* from here on lanes come from state alone, no input XOR */ \
+    X##mi = X##mio.v128; \
+    X##kemi.v128 = GET64LOLO(X##ke, X##mi); /* packed ke|mi pair; copyToState writes its elements to state[11] and state[17] */ \
+    X##mo = GET64HIHI(X##mio.v128, X##mio.v128); \
+    XOReq128(Cio, X##mio.v128); \
+    X##mu = LOAD64(state[19]); \
+    XOReq64(Cu, X##mu); \
+    X##sae.v128 = LOAD128(state[20]); \
+    X##sa = X##sae.v128; \
+    X##se = GET64HIHI(X##sae.v128, X##sae.v128); \
+    XOReq128(Cae, X##sae.v128); \
+    X##sio.v128 = LOAD128(state[22]); \
+    X##si = X##sio.v128; \
+    X##so = GET64HIHI(X##sio.v128, X##sio.v128); \
+    XOReq128(Cio, X##sio.v128); \
+    X##su = LOAD64(state[24]); \
+    XOReq64(Cu, X##su); \
+
+#define copyFromStateAndXor1088bits(X, state, input) /* load state[0..24] into the X##.. variables, XORing in input (read at [0], [2], [4], [5], [7], [9], [10], [12], [14], [15]); also builds the Cae/Cio/Cu accumulators */ \
+    X##bae.v128 = XOR128(LOAD128(state[ 0]), LOAD128u(input[ 0])); \
+    X##ba = X##bae.v128; \
+    X##be = GET64HIHI(X##bae.v128, X##bae.v128); \
+    Cae = X##bae.v128; /* Cae starts from the b-row a|e pair; the g, k, m and s pairs are folded in below */ \
+    X##bio.v128 = XOR128(LOAD128(state[ 2]), LOAD128u(input[ 2])); \
+    X##bi = X##bio.v128; \
+    X##bo = GET64HIHI(X##bio.v128, X##bio.v128); \
+    Cio = X##bio.v128; /* Cio starts from the b-row i|o pair */ \
+    X##bu = XOR64(LOAD64(state[ 4]), LOAD64(input[ 4])); \
+    Cu = X##bu; /* Cu starts from bu; gu, ku, mu and su are folded in below */ \
+    X##gae.v128 = XOR128(LOAD128u(state[ 5]), LOAD128u(input[ 5])); \
+    X##ga = X##gae.v128; \
+    X##ge = GET64HIHI(X##gae.v128, X##gae.v128); \
+    X##bage.v128 = GET64LOLO(X##ba, X##ge); /* packed ba|ge pair; copyToState writes its elements to state[0] and state[6] */ \
+    XOReq128(Cae, X##gae.v128); \
+    X##gio.v128 = XOR128(LOAD128u(state[ 7]), LOAD128u(input[ 7])); \
+    X##gi = X##gio.v128; \
+    X##begi.v128 = GET64LOLO(X##be, X##gi); /* packed be|gi pair; copyToState writes its elements to state[1] and state[7] */ \
+    X##go = GET64HIHI(X##gio.v128, X##gio.v128); \
+    XOReq128(Cio, X##gio.v128); \
+    X##gu = XOR64(LOAD64(state[ 9]), LOAD64(input[ 9])); \
+    XOReq64(Cu, X##gu); \
+    X##kae.v128 = XOR128(LOAD128(state[10]), LOAD128u(input[10])); \
+    X##ka = X##kae.v128; \
+    X##ke = GET64HIHI(X##kae.v128, X##kae.v128); \
+    XOReq128(Cae, X##kae.v128); \
+    X##kio.v128 = XOR128(LOAD128(state[12]), LOAD128u(input[12])); \
+    X##ki = X##kio.v128; \
+    X##ko = GET64HIHI(X##kio.v128, X##kio.v128); \
+    XOReq128(Cio, X##kio.v128); \
+    X##ku = XOR64(LOAD64(state[14]), LOAD64(input[14])); \
+    XOReq64(Cu, X##ku); \
+    X##mae.v128 = XOR128(LOAD128u(state[15]), LOAD128u(input[15])); /* last read from input in this macro */ \
+    X##ma = X##mae.v128; \
+    X##me = GET64HIHI(X##mae.v128, X##mae.v128); \
+    X##kame.v128 = GET64LOLO(X##ka, X##me); /* packed ka|me pair; copyToState writes its elements to state[10] and state[16] */ \
+    XOReq128(Cae, X##mae.v128); \
+    X##mio.v128 = LOAD128u(state[17]); /* from here on lanes come from state alone, no input XOR */ \
+    X##mi = X##mio.v128; \
+    X##kemi.v128 = GET64LOLO(X##ke, X##mi); /* packed ke|mi pair; copyToState writes its elements to state[11] and state[17] */ \
+    X##mo = GET64HIHI(X##mio.v128, X##mio.v128); \
+    XOReq128(Cio, X##mio.v128); \
+    X##mu = LOAD64(state[19]); \
+    XOReq64(Cu, X##mu); \
+    X##sae.v128 = LOAD128(state[20]); \
+    X##sa = X##sae.v128; \
+    X##se = GET64HIHI(X##sae.v128, X##sae.v128); \
+    XOReq128(Cae, X##sae.v128); \
+    X##sio.v128 = LOAD128(state[22]); \
+    X##si = X##sio.v128; \
+    X##so = GET64HIHI(X##sio.v128, X##sio.v128); \
+    XOReq128(Cio, X##sio.v128); \
+    X##su = LOAD64(state[24]); \
+    XOReq64(Cu, X##su); \
+
+#define copyFromStateAndXor1152bits(X, state, input) /* load state[0..24] into the X##.. variables, XORing in input (read at [0], [2], [4], [5], [7], [9], [10], [12], [14], [15], [17]); also builds the Cae/Cio/Cu accumulators */ \
+    X##bae.v128 = XOR128(LOAD128(state[ 0]), LOAD128u(input[ 0])); \
+    X##ba = X##bae.v128; \
+    X##be = GET64HIHI(X##bae.v128, X##bae.v128); \
+    Cae = X##bae.v128; /* Cae starts from the b-row a|e pair; the g, k, m and s pairs are folded in below */ \
+    X##bio.v128 = XOR128(LOAD128(state[ 2]), LOAD128u(input[ 2])); \
+    X##bi = X##bio.v128; \
+    X##bo = GET64HIHI(X##bio.v128, X##bio.v128); \
+    Cio = X##bio.v128; /* Cio starts from the b-row i|o pair */ \
+    X##bu = XOR64(LOAD64(state[ 4]), LOAD64(input[ 4])); \
+    Cu = X##bu; /* Cu starts from bu; gu, ku, mu and su are folded in below */ \
+    X##gae.v128 = XOR128(LOAD128u(state[ 5]), LOAD128u(input[ 5])); \
+    X##ga = X##gae.v128; \
+    X##ge = GET64HIHI(X##gae.v128, X##gae.v128); \
+    X##bage.v128 = GET64LOLO(X##ba, X##ge); /* packed ba|ge pair; copyToState writes its elements to state[0] and state[6] */ \
+    XOReq128(Cae, X##gae.v128); \
+    X##gio.v128 = XOR128(LOAD128u(state[ 7]), LOAD128u(input[ 7])); \
+    X##gi = X##gio.v128; \
+    X##begi.v128 = GET64LOLO(X##be, X##gi); /* packed be|gi pair; copyToState writes its elements to state[1] and state[7] */ \
+    X##go = GET64HIHI(X##gio.v128, X##gio.v128); \
+    XOReq128(Cio, X##gio.v128); \
+    X##gu = XOR64(LOAD64(state[ 9]), LOAD64(input[ 9])); \
+    XOReq64(Cu, X##gu); \
+    X##kae.v128 = XOR128(LOAD128(state[10]), LOAD128u(input[10])); \
+    X##ka = X##kae.v128; \
+    X##ke = GET64HIHI(X##kae.v128, X##kae.v128); \
+    XOReq128(Cae, X##kae.v128); \
+    X##kio.v128 = XOR128(LOAD128(state[12]), LOAD128u(input[12])); \
+    X##ki = X##kio.v128; \
+    X##ko = GET64HIHI(X##kio.v128, X##kio.v128); \
+    XOReq128(Cio, X##kio.v128); \
+    X##ku = XOR64(LOAD64(state[14]), LOAD64(input[14])); \
+    XOReq64(Cu, X##ku); \
+    X##mae.v128 = XOR128(LOAD128u(state[15]), LOAD128u(input[15])); \
+    X##ma = X##mae.v128; \
+    X##me = GET64HIHI(X##mae.v128, X##mae.v128); \
+    X##kame.v128 = GET64LOLO(X##ka, X##me); /* packed ka|me pair; copyToState writes its elements to state[10] and state[16] */ \
+    XOReq128(Cae, X##mae.v128); \
+    X##mio.v128 = XOR128(LOAD128u(state[17]), LOAD64(input[17])); /* last read from input. NOTE(review): 64-bit load inside a 128-bit XOR, so presumably only mi receives input and mo stays state[18]; confirm LOAD64 zero-fills the upper half */ \
+    X##mi = X##mio.v128; \
+    X##kemi.v128 = GET64LOLO(X##ke, X##mi); /* packed ke|mi pair; copyToState writes its elements to state[11] and state[17] */ \
+    X##mo = GET64HIHI(X##mio.v128, X##mio.v128); \
+    XOReq128(Cio, X##mio.v128); \
+    X##mu = LOAD64(state[19]); /* from here on lanes come from state alone, no input XOR */ \
+    XOReq64(Cu, X##mu); \
+    X##sae.v128 = LOAD128(state[20]); \
+    X##sa = X##sae.v128; \
+    X##se = GET64HIHI(X##sae.v128, X##sae.v128); \
+    XOReq128(Cae, X##sae.v128); \
+    X##sio.v128 = LOAD128(state[22]); \
+    X##si = X##sio.v128; \
+    X##so = GET64HIHI(X##sio.v128, X##sio.v128); \
+    XOReq128(Cio, X##sio.v128); \
+    X##su = LOAD64(state[24]); \
+    XOReq64(Cu, X##su); \
+
+#define copyFromStateAndXor1344bits(X, state, input) /* load state[0..24] into the X##.. variables, XORing in input (read at [0], [2], [4], [5], [7], [9], [10], [12], [14], [15], [17], [19], [20]); also builds the Cae/Cio/Cu accumulators */ \
+    X##bae.v128 = XOR128(LOAD128(state[ 0]), LOAD128u(input[ 0])); \
+    X##ba = X##bae.v128; \
+    X##be = GET64HIHI(X##bae.v128, X##bae.v128); \
+    Cae = X##bae.v128; /* Cae starts from the b-row a|e pair; the g, k, m and s pairs are folded in below */ \
+    X##bio.v128 = XOR128(LOAD128(state[ 2]), LOAD128u(input[ 2])); \
+    X##bi = X##bio.v128; \
+    X##bo = GET64HIHI(X##bio.v128, X##bio.v128); \
+    Cio = X##bio.v128; /* Cio starts from the b-row i|o pair */ \
+    X##bu = XOR64(LOAD64(state[ 4]), LOAD64(input[ 4])); \
+    Cu = X##bu; /* Cu starts from bu; gu, ku, mu and su are folded in below */ \
+    X##gae.v128 = XOR128(LOAD128u(state[ 5]), LOAD128u(input[ 5])); \
+    X##ga = X##gae.v128; \
+    X##ge = GET64HIHI(X##gae.v128, X##gae.v128); \
+    X##bage.v128 = GET64LOLO(X##ba, X##ge); /* packed ba|ge pair; copyToState writes its elements to state[0] and state[6] */ \
+    XOReq128(Cae, X##gae.v128); \
+    X##gio.v128 = XOR128(LOAD128u(state[ 7]), LOAD128u(input[ 7])); \
+    X##gi = X##gio.v128; \
+    X##begi.v128 = GET64LOLO(X##be, X##gi); /* packed be|gi pair; copyToState writes its elements to state[1] and state[7] */ \
+    X##go = GET64HIHI(X##gio.v128, X##gio.v128); \
+    XOReq128(Cio, X##gio.v128); \
+    X##gu = XOR64(LOAD64(state[ 9]), LOAD64(input[ 9])); \
+    XOReq64(Cu, X##gu); \
+    X##kae.v128 = XOR128(LOAD128(state[10]), LOAD128u(input[10])); \
+    X##ka = X##kae.v128; \
+    X##ke = GET64HIHI(X##kae.v128, X##kae.v128); \
+    XOReq128(Cae, X##kae.v128); \
+    X##kio.v128 = XOR128(LOAD128(state[12]), LOAD128u(input[12])); \
+    X##ki = X##kio.v128; \
+    X##ko = GET64HIHI(X##kio.v128, X##kio.v128); \
+    XOReq128(Cio, X##kio.v128); \
+    X##ku = XOR64(LOAD64(state[14]), LOAD64(input[14])); \
+    XOReq64(Cu, X##ku); \
+    X##mae.v128 = XOR128(LOAD128u(state[15]), LOAD128u(input[15])); \
+    X##ma = X##mae.v128; \
+    X##me = GET64HIHI(X##mae.v128, X##mae.v128); \
+    X##kame.v128 = GET64LOLO(X##ka, X##me); /* packed ka|me pair; copyToState writes its elements to state[10] and state[16] */ \
+    XOReq128(Cae, X##mae.v128); \
+    X##mio.v128 = XOR128(LOAD128u(state[17]), LOAD128u(input[17])); \
+    X##mi = X##mio.v128; \
+    X##kemi.v128 = GET64LOLO(X##ke, X##mi); /* packed ke|mi pair; copyToState writes its elements to state[11] and state[17] */ \
+    X##mo = GET64HIHI(X##mio.v128, X##mio.v128); \
+    XOReq128(Cio, X##mio.v128); \
+    X##mu = XOR64(LOAD64(state[19]), LOAD64(input[19])); \
+    XOReq64(Cu, X##mu); \
+    X##sae.v128 = XOR128(LOAD128(state[20]), LOAD64(input[20])); /* last read from input. NOTE(review): 64-bit load inside a 128-bit XOR, so presumably only sa receives input and se stays state[21]; confirm LOAD64 zero-fills the upper half */ \
+    X##sa = X##sae.v128; \
+    X##se = GET64HIHI(X##sae.v128, X##sae.v128); \
+    XOReq128(Cae, X##sae.v128); \
+    X##sio.v128 = LOAD128(state[22]); /* from here on lanes come from state alone, no input XOR */ \
+    X##si = X##sio.v128; \
+    X##so = GET64HIHI(X##sio.v128, X##sio.v128); \
+    XOReq128(Cio, X##sio.v128); \
+    X##su = LOAD64(state[24]); \
+    XOReq64(Cu, X##su); \
+
+#define copyFromState(X, state) /* load state[0..24] into the X##.. variables without any input XOR; also builds the Cae/Cio/Cu accumulators */ \
+    X##bae.v128 = LOAD128(state[ 0]); \
+    X##ba = X##bae.v128; \
+    X##be = GET64HIHI(X##bae.v128, X##bae.v128); \
+    Cae = X##bae.v128; /* Cae starts from the b-row a|e pair; the g, k, m and s pairs are folded in below */ \
+    X##bio.v128 = LOAD128(state[ 2]); \
+    X##bi = X##bio.v128; \
+    X##bo = GET64HIHI(X##bio.v128, X##bio.v128); \
+    Cio = X##bio.v128; /* Cio starts from the b-row i|o pair */ \
+    X##bu = LOAD64(state[ 4]); \
+    Cu = X##bu; /* Cu starts from bu; gu, ku, mu and su are folded in below */ \
+    X##gae.v128 = LOAD128u(state[ 5]); \
+    X##ga = X##gae.v128; \
+    X##ge = GET64HIHI(X##gae.v128, X##gae.v128); \
+    X##bage.v128 = GET64LOLO(X##ba, X##ge); /* packed ba|ge pair; copyToState writes its elements to state[0] and state[6] */ \
+    XOReq128(Cae, X##gae.v128); \
+    X##gio.v128 = LOAD128u(state[ 7]); \
+    X##gi = X##gio.v128; \
+    X##begi.v128 = GET64LOLO(X##be, X##gi); /* packed be|gi pair; copyToState writes its elements to state[1] and state[7] */ \
+    X##go = GET64HIHI(X##gio.v128, X##gio.v128); \
+    XOReq128(Cio, X##gio.v128); \
+    X##gu = LOAD64(state[ 9]); \
+    XOReq64(Cu, X##gu); \
+    X##kae.v128 = LOAD128(state[10]); \
+    X##ka = X##kae.v128; \
+    X##ke = GET64HIHI(X##kae.v128, X##kae.v128); \
+    XOReq128(Cae, X##kae.v128); \
+    X##kio.v128 = LOAD128(state[12]); \
+    X##ki = X##kio.v128; \
+    X##ko = GET64HIHI(X##kio.v128, X##kio.v128); \
+    XOReq128(Cio, X##kio.v128); \
+    X##ku = LOAD64(state[14]); \
+    XOReq64(Cu, X##ku); \
+    X##mae.v128 = LOAD128u(state[15]); \
+    X##ma = X##mae.v128; \
+    X##me = GET64HIHI(X##mae.v128, X##mae.v128); \
+    X##kame.v128 = GET64LOLO(X##ka, X##me); /* packed ka|me pair; copyToState writes its elements to state[10] and state[16] */ \
+    XOReq128(Cae, X##mae.v128); \
+    X##mio.v128 = LOAD128u(state[17]); \
+    X##mi = X##mio.v128; \
+    X##kemi.v128 = GET64LOLO(X##ke, X##mi); /* packed ke|mi pair; copyToState writes its elements to state[11] and state[17] */ \
+    X##mo = GET64HIHI(X##mio.v128, X##mio.v128); \
+    XOReq128(Cio, X##mio.v128); \
+    X##mu = LOAD64(state[19]); \
+    XOReq64(Cu, X##mu); \
+    X##sae.v128 = LOAD128(state[20]); \
+    X##sa = X##sae.v128; \
+    X##se = GET64HIHI(X##sae.v128, X##sae.v128); \
+    XOReq128(Cae, X##sae.v128); \
+    X##sio.v128 = LOAD128(state[22]); \
+    X##si = X##sio.v128; \
+    X##so = GET64HIHI(X##sio.v128, X##sio.v128); \
+    XOReq128(Cio, X##sio.v128); \
+    X##su = LOAD64(state[24]); \
+    XOReq64(Cu, X##su); \
+
+#define copyToState(state, X) /* write the 25 lanes held in the X##.. variables back to state[0..24] */ \
+    state[ 0] = X##bage.v64[0]; /* ba, element 0 of the packed ba|ge pair (previously pasted a literal A, ignoring X) */ \
+    state[ 1] = X##begi.v64[0]; /* be, element 0 of the packed be|gi pair (same fix) */ \
+    STORE64(state[ 2], X##bi); \
+    STORE64(state[ 3], X##bo); \
+    STORE64(state[ 4], X##bu); \
+    STORE64(state[ 5], X##ga); \
+    state[ 6] = X##bage.v64[1]; /* ge, element 1 of the packed ba|ge pair (same fix) */ \
+    state[ 7] = X##begi.v64[1]; /* gi, element 1 of the packed be|gi pair (same fix) */ \
+    STORE64(state[ 8], X##go); \
+    STORE64(state[ 9], X##gu); \
+    state[10] = X##kame.v64[0]; /* ka, element 0 of the packed ka|me pair */ \
+    state[11] = X##kemi.v64[0]; /* ke, element 0 of the packed ke|mi pair */ \
+    STORE64(state[12], X##ki); \
+    STORE64(state[13], X##ko); \
+    STORE64(state[14], X##ku); \
+    STORE64(state[15], X##ma); \
+    state[16] = X##kame.v64[1]; /* me, element 1 of the packed ka|me pair */ \
+    state[17] = X##kemi.v64[1]; /* mi, element 1 of the packed ke|mi pair */ \
+    STORE64(state[18], X##mo); \
+    STORE64(state[19], X##mu); \
+    STORE64(state[20], X##sa); \
+    STORE64(state[21], X##se); \
+    STORE64(state[22], X##si); \
+    STORE64(state[23], X##so); \
+    STORE64(state[24], X##su); \
+
+#define copyStateVariables(X, Y) /* assign every Y##.. lane variable to its X##.. counterpart: the four packed pairs first, then the single lanes row by row */ \
+    X##bage = Y##bage; \
+    X##begi = Y##begi; \
+    X##kame = Y##kame; \
+    X##kemi = Y##kemi; \
+    X##bi = Y##bi; \
+    X##bo = Y##bo; \
+    X##bu = Y##bu; \
+    X##ga = Y##ga; \
+    X##go = Y##go; \
+    X##gu = Y##gu; \
+    X##ki = Y##ki; \
+    X##ko = Y##ko; \
+    X##ku = Y##ku; \
+    X##ma = Y##ma; \
+    X##mo = Y##mo; \
+    X##mu = Y##mu; \
+    X##sa = Y##sa; \
+    X##se = Y##se; \
+    X##si = Y##si; \
+    X##so = Y##so; \
+    X##su = Y##su; \
+
diff --git a/Modules/_sha3/keccak/KeccakF-1600-simd64.macros b/Modules/_sha3/keccak/KeccakF-1600-simd64.macros
new file mode 100644
--- /dev/null
+++ b/Modules/_sha3/keccak/KeccakF-1600-simd64.macros
@@ -0,0 +1,517 @@
+/*
+Code automatically generated by KeccakTools!
+
+The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+Michaël Peeters and Gilles Van Assche. For more information, feedback or
+questions, please refer to our website: http://keccak.noekeon.org/
+
+Implementation by the designers,
+hereby denoted as "the implementer".
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#define declareABCDE /* locals for the round macros: A.. and E.. lane sets (addressed through the A/E macro arguments), B.., C.., D.. temporaries; grouped here by column suffix */ \
+    V64 Aba, Aga, Aka, Ama, Asa; \
+    V64 Abe, Age, Ake, Ame, Ase; \
+    V64 Abi, Agi, Aki, Ami, Asi; \
+    V64 Abo, Ago, Ako, Amo, Aso; \
+    V64 Abu, Agu, Aku, Amu, Asu; \
+    V64 Bba, Bga, Bka, Bma, Bsa; \
+    V64 Bbe, Bge, Bke, Bme, Bse; \
+    V64 Bbi, Bgi, Bki, Bmi, Bsi; \
+    V64 Bbo, Bgo, Bko, Bmo, Bso; \
+    V64 Bbu, Bgu, Bku, Bmu, Bsu; \
+    V64 Ca, Ce, Ci, Co, Cu; \
+    V64 Da, De, Di, Do, Du; \
+    V64 Eba, Ega, Eka, Ema, Esa; \
+    V64 Ebe, Ege, Eke, Eme, Ese; \
+    V64 Ebi, Egi, Eki, Emi, Esi; \
+    V64 Ebo, Ego, Eko, Emo, Eso; \
+    V64 Ebu, Egu, Eku, Emu, Esu; \
+
+#define prepareTheta /* Ca..Cu = XOR of the five A lanes sharing each column suffix (b, g, k, m, s rows), folded left to right */ \
+    Ca = XOR64(XOR64(XOR64(XOR64(Aba, Aga), Aka), Ama), Asa); \
+    Ce = XOR64(XOR64(XOR64(XOR64(Abe, Age), Ake), Ame), Ase); \
+    Ci = XOR64(XOR64(XOR64(XOR64(Abi, Agi), Aki), Ami), Asi); \
+    Co = XOR64(XOR64(XOR64(XOR64(Abo, Ago), Ako), Amo), Aso); \
+    Cu = XOR64(XOR64(XOR64(XOR64(Abu, Agu), Aku), Amu), Asu); \
+
+/*  --- Code for round, with prepare-theta: computes the E lanes from the A lanes and the current Ca..Cu, and leaves Ca..Cu holding the column-wise XOR of the new E lanes */
+/*  --- 64-bit lanes mapped to 64-bit words; i indexes KeccakF1600RoundConstants; the A lanes are passed to XOReq64 and so presumably modified in place */
+#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \
+    Da = XOR64(Cu, ROL64(Ce, 1)); /* theta: each D combines one column parity with ROL64(.., 1) of another */ \
+    De = XOR64(Ca, ROL64(Ci, 1)); \
+    Di = XOR64(Ce, ROL64(Co, 1)); \
+    Do = XOR64(Ci, ROL64(Cu, 1)); \
+    Du = XOR64(Co, ROL64(Ca, 1)); \
+\
+    XOReq64(A##ba, Da); /* output row b: built from input lanes ba, ge, ki, mo, su */ \
+    Bba = A##ba; \
+    XOReq64(A##ge, De); \
+    Bbe = ROL64(A##ge, 44); \
+    XOReq64(A##ki, Di); \
+    Bbi = ROL64(A##ki, 43); \
+    E##ba = XOR64(Bba, ANDnu64(Bbe, Bbi)); \
+    XOReq64(E##ba, CONST64(KeccakF1600RoundConstants[i])); /* iota: round constant i goes into lane ba only */ \
+    Ca = E##ba; /* restart the column accumulators from output row b */ \
+    XOReq64(A##mo, Do); \
+    Bbo = ROL64(A##mo, 21); \
+    E##be = XOR64(Bbe, ANDnu64(Bbi, Bbo)); \
+    Ce = E##be; \
+    XOReq64(A##su, Du); \
+    Bbu = ROL64(A##su, 14); \
+    E##bi = XOR64(Bbi, ANDnu64(Bbo, Bbu)); \
+    Ci = E##bi; \
+    E##bo = XOR64(Bbo, ANDnu64(Bbu, Bba)); \
+    Co = E##bo; \
+    E##bu = XOR64(Bbu, ANDnu64(Bba, Bbe)); \
+    Cu = E##bu; \
+\
+    XOReq64(A##bo, Do); /* output row g: built from input lanes bo, gu, ka, me, si */ \
+    Bga = ROL64(A##bo, 28); \
+    XOReq64(A##gu, Du); \
+    Bge = ROL64(A##gu, 20); \
+    XOReq64(A##ka, Da); \
+    Bgi = ROL64(A##ka, 3); \
+    E##ga = XOR64(Bga, ANDnu64(Bge, Bgi)); \
+    XOReq64(Ca, E##ga); /* fold each new E lane into its column accumulator */ \
+    XOReq64(A##me, De); \
+    Bgo = ROL64(A##me, 45); \
+    E##ge = XOR64(Bge, ANDnu64(Bgi, Bgo)); \
+    XOReq64(Ce, E##ge); \
+    XOReq64(A##si, Di); \
+    Bgu = ROL64(A##si, 61); \
+    E##gi = XOR64(Bgi, ANDnu64(Bgo, Bgu)); \
+    XOReq64(Ci, E##gi); \
+    E##go = XOR64(Bgo, ANDnu64(Bgu, Bga)); \
+    XOReq64(Co, E##go); \
+    E##gu = XOR64(Bgu, ANDnu64(Bga, Bge)); \
+    XOReq64(Cu, E##gu); \
+\
+    XOReq64(A##be, De); /* output row k: built from input lanes be, gi, ko, mu, sa */ \
+    Bka = ROL64(A##be, 1); \
+    XOReq64(A##gi, Di); \
+    Bke = ROL64(A##gi, 6); \
+    XOReq64(A##ko, Do); \
+    Bki = ROL64(A##ko, 25); \
+    E##ka = XOR64(Bka, ANDnu64(Bke, Bki)); \
+    XOReq64(Ca, E##ka); \
+    XOReq64(A##mu, Du); \
+    Bko = ROL64(A##mu, 8); \
+    E##ke = XOR64(Bke, ANDnu64(Bki, Bko)); \
+    XOReq64(Ce, E##ke); \
+    XOReq64(A##sa, Da); \
+    Bku = ROL64(A##sa, 18); \
+    E##ki = XOR64(Bki, ANDnu64(Bko, Bku)); \
+    XOReq64(Ci, E##ki); \
+    E##ko = XOR64(Bko, ANDnu64(Bku, Bka)); \
+    XOReq64(Co, E##ko); \
+    E##ku = XOR64(Bku, ANDnu64(Bka, Bke)); \
+    XOReq64(Cu, E##ku); \
+\
+    XOReq64(A##bu, Du); /* output row m: built from input lanes bu, ga, ke, mi, so */ \
+    Bma = ROL64(A##bu, 27); \
+    XOReq64(A##ga, Da); \
+    Bme = ROL64(A##ga, 36); \
+    XOReq64(A##ke, De); \
+    Bmi = ROL64(A##ke, 10); \
+    E##ma = XOR64(Bma, ANDnu64(Bme, Bmi)); \
+    XOReq64(Ca, E##ma); \
+    XOReq64(A##mi, Di); \
+    Bmo = ROL64(A##mi, 15); \
+    E##me = XOR64(Bme, ANDnu64(Bmi, Bmo)); \
+    XOReq64(Ce, E##me); \
+    XOReq64(A##so, Do); \
+    Bmu = ROL64(A##so, 56); \
+    E##mi = XOR64(Bmi, ANDnu64(Bmo, Bmu)); \
+    XOReq64(Ci, E##mi); \
+    E##mo = XOR64(Bmo, ANDnu64(Bmu, Bma)); \
+    XOReq64(Co, E##mo); \
+    E##mu = XOR64(Bmu, ANDnu64(Bma, Bme)); \
+    XOReq64(Cu, E##mu); \
+\
+    XOReq64(A##bi, Di); /* output row s: built from input lanes bi, go, ku, ma, se */ \
+    Bsa = ROL64(A##bi, 62); \
+    XOReq64(A##go, Do); \
+    Bse = ROL64(A##go, 55); \
+    XOReq64(A##ku, Du); \
+    Bsi = ROL64(A##ku, 39); \
+    E##sa = XOR64(Bsa, ANDnu64(Bse, Bsi)); \
+    XOReq64(Ca, E##sa); \
+    XOReq64(A##ma, Da); \
+    Bso = ROL64(A##ma, 41); \
+    E##se = XOR64(Bse, ANDnu64(Bsi, Bso)); \
+    XOReq64(Ce, E##se); \
+    XOReq64(A##se, De); \
+    Bsu = ROL64(A##se, 2); \
+    E##si = XOR64(Bsi, ANDnu64(Bso, Bsu)); \
+    XOReq64(Ci, E##si); \
+    E##so = XOR64(Bso, ANDnu64(Bsu, Bsa)); \
+    XOReq64(Co, E##so); \
+    E##su = XOR64(Bsu, ANDnu64(Bsa, Bse)); \
+    XOReq64(Cu, E##su); \
+\
+
+/*  --- Code for round: computes the E lanes from the A lanes and the current Ca..Cu; unlike the prepare-theta variant, Ca..Cu are only read here, never updated */
+/*  --- 64-bit lanes mapped to 64-bit words; i indexes KeccakF1600RoundConstants; the A lanes are passed to XOReq64 and so presumably modified in place */
+#define thetaRhoPiChiIota(i, A, E) \
+    Da = XOR64(Cu, ROL64(Ce, 1)); /* theta: each D combines one column parity with ROL64(.., 1) of another */ \
+    De = XOR64(Ca, ROL64(Ci, 1)); \
+    Di = XOR64(Ce, ROL64(Co, 1)); \
+    Do = XOR64(Ci, ROL64(Cu, 1)); \
+    Du = XOR64(Co, ROL64(Ca, 1)); \
+\
+    XOReq64(A##ba, Da); /* output row b: built from input lanes ba, ge, ki, mo, su */ \
+    Bba = A##ba; \
+    XOReq64(A##ge, De); \
+    Bbe = ROL64(A##ge, 44); \
+    XOReq64(A##ki, Di); \
+    Bbi = ROL64(A##ki, 43); \
+    E##ba = XOR64(Bba, ANDnu64(Bbe, Bbi)); \
+    XOReq64(E##ba, CONST64(KeccakF1600RoundConstants[i])); /* iota: round constant i goes into lane ba only */ \
+    XOReq64(A##mo, Do); \
+    Bbo = ROL64(A##mo, 21); \
+    E##be = XOR64(Bbe, ANDnu64(Bbi, Bbo)); \
+    XOReq64(A##su, Du); \
+    Bbu = ROL64(A##su, 14); \
+    E##bi = XOR64(Bbi, ANDnu64(Bbo, Bbu)); \
+    E##bo = XOR64(Bbo, ANDnu64(Bbu, Bba)); \
+    E##bu = XOR64(Bbu, ANDnu64(Bba, Bbe)); \
+\
+    XOReq64(A##bo, Do); /* output row g: built from input lanes bo, gu, ka, me, si */ \
+    Bga = ROL64(A##bo, 28); \
+    XOReq64(A##gu, Du); \
+    Bge = ROL64(A##gu, 20); \
+    XOReq64(A##ka, Da); \
+    Bgi = ROL64(A##ka, 3); \
+    E##ga = XOR64(Bga, ANDnu64(Bge, Bgi)); \
+    XOReq64(A##me, De); \
+    Bgo = ROL64(A##me, 45); \
+    E##ge = XOR64(Bge, ANDnu64(Bgi, Bgo)); \
+    XOReq64(A##si, Di); \
+    Bgu = ROL64(A##si, 61); \
+    E##gi = XOR64(Bgi, ANDnu64(Bgo, Bgu)); \
+    E##go = XOR64(Bgo, ANDnu64(Bgu, Bga)); \
+    E##gu = XOR64(Bgu, ANDnu64(Bga, Bge)); \
+\
+    XOReq64(A##be, De); /* output row k: built from input lanes be, gi, ko, mu, sa */ \
+    Bka = ROL64(A##be, 1); \
+    XOReq64(A##gi, Di); \
+    Bke = ROL64(A##gi, 6); \
+    XOReq64(A##ko, Do); \
+    Bki = ROL64(A##ko, 25); \
+    E##ka = XOR64(Bka, ANDnu64(Bke, Bki)); \
+    XOReq64(A##mu, Du); \
+    Bko = ROL64(A##mu, 8); \
+    E##ke = XOR64(Bke, ANDnu64(Bki, Bko)); \
+    XOReq64(A##sa, Da); \
+    Bku = ROL64(A##sa, 18); \
+    E##ki = XOR64(Bki, ANDnu64(Bko, Bku)); \
+    E##ko = XOR64(Bko, ANDnu64(Bku, Bka)); \
+    E##ku = XOR64(Bku, ANDnu64(Bka, Bke)); \
+\
+    XOReq64(A##bu, Du); /* output row m: built from input lanes bu, ga, ke, mi, so */ \
+    Bma = ROL64(A##bu, 27); \
+    XOReq64(A##ga, Da); \
+    Bme = ROL64(A##ga, 36); \
+    XOReq64(A##ke, De); \
+    Bmi = ROL64(A##ke, 10); \
+    E##ma = XOR64(Bma, ANDnu64(Bme, Bmi)); \
+    XOReq64(A##mi, Di); \
+    Bmo = ROL64(A##mi, 15); \
+    E##me = XOR64(Bme, ANDnu64(Bmi, Bmo)); \
+    XOReq64(A##so, Do); \
+    Bmu = ROL64(A##so, 56); \
+    E##mi = XOR64(Bmi, ANDnu64(Bmo, Bmu)); \
+    E##mo = XOR64(Bmo, ANDnu64(Bmu, Bma)); \
+    E##mu = XOR64(Bmu, ANDnu64(Bma, Bme)); \
+\
+    XOReq64(A##bi, Di); /* output row s: built from input lanes bi, go, ku, ma, se */ \
+    Bsa = ROL64(A##bi, 62); \
+    XOReq64(A##go, Do); \
+    Bse = ROL64(A##go, 55); \
+    XOReq64(A##ku, Du); \
+    Bsi = ROL64(A##ku, 39); \
+    E##sa = XOR64(Bsa, ANDnu64(Bse, Bsi)); \
+    XOReq64(A##ma, Da); \
+    Bso = ROL64(A##ma, 41); \
+    E##se = XOR64(Bse, ANDnu64(Bsi, Bso)); \
+    XOReq64(A##se, De); \
+    Bsu = ROL64(A##se, 2); \
+    E##si = XOR64(Bsi, ANDnu64(Bso, Bsu)); \
+    E##so = XOR64(Bso, ANDnu64(Bsu, Bsa)); \
+    E##su = XOR64(Bsu, ANDnu64(Bsa, Bse)); \
+\
+
+static const UINT64 KeccakF1600RoundConstants[24] = { /* Keccak-f[1600] round constants; entry [i] is XORed into lane E##ba by thetaRhoPiChiIota(i, ..) and thetaRhoPiChiIotaPrepareTheta(i, ..) */
+    0x0000000000000001ULL, /* [ 0] */
+    0x0000000000008082ULL, /* [ 1] */
+    0x800000000000808aULL, /* [ 2] */
+    0x8000000080008000ULL, /* [ 3] */
+    0x000000000000808bULL, /* [ 4] */
+    0x0000000080000001ULL, /* [ 5] */
+    0x8000000080008081ULL, /* [ 6] */
+    0x8000000000008009ULL, /* [ 7] */
+    0x000000000000008aULL, /* [ 8] */
+    0x0000000000000088ULL, /* [ 9] */
+    0x0000000080008009ULL, /* [10] */
+    0x000000008000000aULL, /* [11] */
+    0x000000008000808bULL, /* [12] */
+    0x800000000000008bULL, /* [13] */
+    0x8000000000008089ULL, /* [14] */
+    0x8000000000008003ULL, /* [15] */
+    0x8000000000008002ULL, /* [16] */
+    0x8000000000000080ULL, /* [17] */
+    0x000000000000800aULL, /* [18] */
+    0x800000008000000aULL, /* [19] */
+    0x8000000080008081ULL, /* [20] */
+    0x8000000000008080ULL, /* [21] */
+    0x0000000080000001ULL, /* [22] */
+    0x8000000080008008ULL }; /* [23] */
+
+#define copyFromStateAndXor576bits(X, state, input) /* lanes 0..8: input XOR state; lanes 9..24: plain copy of state */ \
+    X##ba = XOR64(LOAD64(input[ 0]), LOAD64(state[ 0])); \
+    X##be = XOR64(LOAD64(input[ 1]), LOAD64(state[ 1])); \
+    X##bi = XOR64(LOAD64(input[ 2]), LOAD64(state[ 2])); \
+    X##bo = XOR64(LOAD64(input[ 3]), LOAD64(state[ 3])); \
+    X##bu = XOR64(LOAD64(input[ 4]), LOAD64(state[ 4])); \
+    X##ga = XOR64(LOAD64(input[ 5]), LOAD64(state[ 5])); \
+    X##ge = XOR64(LOAD64(input[ 6]), LOAD64(state[ 6])); \
+    X##gi = XOR64(LOAD64(input[ 7]), LOAD64(state[ 7])); \
+    X##go = XOR64(LOAD64(input[ 8]), LOAD64(state[ 8])); /* last lane that takes input */ \
+    X##gu = LOAD64(state[ 9]); /* first lane taken from state alone */ \
+    X##ka = LOAD64(state[10]); \
+    X##ke = LOAD64(state[11]); \
+    X##ki = LOAD64(state[12]); \
+    X##ko = LOAD64(state[13]); \
+    X##ku = LOAD64(state[14]); \
+    X##ma = LOAD64(state[15]); \
+    X##me = LOAD64(state[16]); \
+    X##mi = LOAD64(state[17]); \
+    X##mo = LOAD64(state[18]); \
+    X##mu = LOAD64(state[19]); \
+    X##sa = LOAD64(state[20]); \
+    X##se = LOAD64(state[21]); \
+    X##si = LOAD64(state[22]); \
+    X##so = LOAD64(state[23]); \
+    X##su = LOAD64(state[24]); \
+
+#define copyFromStateAndXor832bits(X, state, input) /* lanes 0..12: input XOR state; lanes 13..24: plain copy of state */ \
+    X##ba = XOR64(LOAD64(input[ 0]), LOAD64(state[ 0])); \
+    X##be = XOR64(LOAD64(input[ 1]), LOAD64(state[ 1])); \
+    X##bi = XOR64(LOAD64(input[ 2]), LOAD64(state[ 2])); \
+    X##bo = XOR64(LOAD64(input[ 3]), LOAD64(state[ 3])); \
+    X##bu = XOR64(LOAD64(input[ 4]), LOAD64(state[ 4])); \
+    X##ga = XOR64(LOAD64(input[ 5]), LOAD64(state[ 5])); \
+    X##ge = XOR64(LOAD64(input[ 6]), LOAD64(state[ 6])); \
+    X##gi = XOR64(LOAD64(input[ 7]), LOAD64(state[ 7])); \
+    X##go = XOR64(LOAD64(input[ 8]), LOAD64(state[ 8])); \
+    X##gu = XOR64(LOAD64(input[ 9]), LOAD64(state[ 9])); \
+    X##ka = XOR64(LOAD64(input[10]), LOAD64(state[10])); \
+    X##ke = XOR64(LOAD64(input[11]), LOAD64(state[11])); \
+    X##ki = XOR64(LOAD64(input[12]), LOAD64(state[12])); /* last lane that takes input */ \
+    X##ko = LOAD64(state[13]); /* first lane taken from state alone */ \
+    X##ku = LOAD64(state[14]); \
+    X##ma = LOAD64(state[15]); \
+    X##me = LOAD64(state[16]); \
+    X##mi = LOAD64(state[17]); \
+    X##mo = LOAD64(state[18]); \
+    X##mu = LOAD64(state[19]); \
+    X##sa = LOAD64(state[20]); \
+    X##se = LOAD64(state[21]); \
+    X##si = LOAD64(state[22]); \
+    X##so = LOAD64(state[23]); \
+    X##su = LOAD64(state[24]); \
+
+#define copyFromStateAndXor1024bits(X, state, input) /* lanes 0..15: input XOR state; lanes 16..24: plain copy of state */ \
+    X##ba = XOR64(LOAD64(input[ 0]), LOAD64(state[ 0])); \
+    X##be = XOR64(LOAD64(input[ 1]), LOAD64(state[ 1])); \
+    X##bi = XOR64(LOAD64(input[ 2]), LOAD64(state[ 2])); \
+    X##bo = XOR64(LOAD64(input[ 3]), LOAD64(state[ 3])); \
+    X##bu = XOR64(LOAD64(input[ 4]), LOAD64(state[ 4])); \
+    X##ga = XOR64(LOAD64(input[ 5]), LOAD64(state[ 5])); \
+    X##ge = XOR64(LOAD64(input[ 6]), LOAD64(state[ 6])); \
+    X##gi = XOR64(LOAD64(input[ 7]), LOAD64(state[ 7])); \
+    X##go = XOR64(LOAD64(input[ 8]), LOAD64(state[ 8])); \
+    X##gu = XOR64(LOAD64(input[ 9]), LOAD64(state[ 9])); \
+    X##ka = XOR64(LOAD64(input[10]), LOAD64(state[10])); \
+    X##ke = XOR64(LOAD64(input[11]), LOAD64(state[11])); \
+    X##ki = XOR64(LOAD64(input[12]), LOAD64(state[12])); \
+    X##ko = XOR64(LOAD64(input[13]), LOAD64(state[13])); \
+    X##ku = XOR64(LOAD64(input[14]), LOAD64(state[14])); \
+    X##ma = XOR64(LOAD64(input[15]), LOAD64(state[15])); /* last lane that takes input */ \
+    X##me = LOAD64(state[16]); /* first lane taken from state alone */ \
+    X##mi = LOAD64(state[17]); \
+    X##mo = LOAD64(state[18]); \
+    X##mu = LOAD64(state[19]); \
+    X##sa = LOAD64(state[20]); \
+    X##se = LOAD64(state[21]); \
+    X##si = LOAD64(state[22]); \
+    X##so = LOAD64(state[23]); \
+    X##su = LOAD64(state[24]); \
+
+#define copyFromStateAndXor1088bits(X, state, input) /* lanes 0..16: input XOR state; lanes 17..24: plain copy of state */ \
+    X##ba = XOR64(LOAD64(input[ 0]), LOAD64(state[ 0])); \
+    X##be = XOR64(LOAD64(input[ 1]), LOAD64(state[ 1])); \
+    X##bi = XOR64(LOAD64(input[ 2]), LOAD64(state[ 2])); \
+    X##bo = XOR64(LOAD64(input[ 3]), LOAD64(state[ 3])); \
+    X##bu = XOR64(LOAD64(input[ 4]), LOAD64(state[ 4])); \
+    X##ga = XOR64(LOAD64(input[ 5]), LOAD64(state[ 5])); \
+    X##ge = XOR64(LOAD64(input[ 6]), LOAD64(state[ 6])); \
+    X##gi = XOR64(LOAD64(input[ 7]), LOAD64(state[ 7])); \
+    X##go = XOR64(LOAD64(input[ 8]), LOAD64(state[ 8])); \
+    X##gu = XOR64(LOAD64(input[ 9]), LOAD64(state[ 9])); \
+    X##ka = XOR64(LOAD64(input[10]), LOAD64(state[10])); \
+    X##ke = XOR64(LOAD64(input[11]), LOAD64(state[11])); \
+    X##ki = XOR64(LOAD64(input[12]), LOAD64(state[12])); \
+    X##ko = XOR64(LOAD64(input[13]), LOAD64(state[13])); \
+    X##ku = XOR64(LOAD64(input[14]), LOAD64(state[14])); \
+    X##ma = XOR64(LOAD64(input[15]), LOAD64(state[15])); \
+    X##me = XOR64(LOAD64(input[16]), LOAD64(state[16])); /* last lane that takes input */ \
+    X##mi = LOAD64(state[17]); /* first lane taken from state alone */ \
+    X##mo = LOAD64(state[18]); \
+    X##mu = LOAD64(state[19]); \
+    X##sa = LOAD64(state[20]); \
+    X##se = LOAD64(state[21]); \
+    X##si = LOAD64(state[22]); \
+    X##so = LOAD64(state[23]); \
+    X##su = LOAD64(state[24]); \
+
+#define copyFromStateAndXor1152bits(X, state, input) /* lanes 0..17: input XOR state; lanes 18..24: plain copy of state */ \
+    X##ba = XOR64(LOAD64(input[ 0]), LOAD64(state[ 0])); \
+    X##be = XOR64(LOAD64(input[ 1]), LOAD64(state[ 1])); \
+    X##bi = XOR64(LOAD64(input[ 2]), LOAD64(state[ 2])); \
+    X##bo = XOR64(LOAD64(input[ 3]), LOAD64(state[ 3])); \
+    X##bu = XOR64(LOAD64(input[ 4]), LOAD64(state[ 4])); \
+    X##ga = XOR64(LOAD64(input[ 5]), LOAD64(state[ 5])); \
+    X##ge = XOR64(LOAD64(input[ 6]), LOAD64(state[ 6])); \
+    X##gi = XOR64(LOAD64(input[ 7]), LOAD64(state[ 7])); \
+    X##go = XOR64(LOAD64(input[ 8]), LOAD64(state[ 8])); \
+    X##gu = XOR64(LOAD64(input[ 9]), LOAD64(state[ 9])); \
+    X##ka = XOR64(LOAD64(input[10]), LOAD64(state[10])); \
+    X##ke = XOR64(LOAD64(input[11]), LOAD64(state[11])); \
+    X##ki = XOR64(LOAD64(input[12]), LOAD64(state[12])); \
+    X##ko = XOR64(LOAD64(input[13]), LOAD64(state[13])); \
+    X##ku = XOR64(LOAD64(input[14]), LOAD64(state[14])); \
+    X##ma = XOR64(LOAD64(input[15]), LOAD64(state[15])); \
+    X##me = XOR64(LOAD64(input[16]), LOAD64(state[16])); \
+    X##mi = XOR64(LOAD64(input[17]), LOAD64(state[17])); /* last lane that takes input */ \
+    X##mo = LOAD64(state[18]); /* first lane taken from state alone */ \
+    X##mu = LOAD64(state[19]); \
+    X##sa = LOAD64(state[20]); \
+    X##se = LOAD64(state[21]); \
+    X##si = LOAD64(state[22]); \
+    X##so = LOAD64(state[23]); \
+    X##su = LOAD64(state[24]); \
+
+#define copyFromStateAndXor1344bits(X, state, input) /* X##ba..X##su = the 25 state lanes, with input lanes 0..20 XORed in */ \
+    X##ba = XOR64(LOAD64(state[ 0]), LOAD64(input[ 0])); \
+    X##be = XOR64(LOAD64(state[ 1]), LOAD64(input[ 1])); \
+    X##bi = XOR64(LOAD64(state[ 2]), LOAD64(input[ 2])); \
+    X##bo = XOR64(LOAD64(state[ 3]), LOAD64(input[ 3])); \
+    X##bu = XOR64(LOAD64(state[ 4]), LOAD64(input[ 4])); \
+    X##ga = XOR64(LOAD64(state[ 5]), LOAD64(input[ 5])); \
+    X##ge = XOR64(LOAD64(state[ 6]), LOAD64(input[ 6])); \
+    X##gi = XOR64(LOAD64(state[ 7]), LOAD64(input[ 7])); \
+    X##go = XOR64(LOAD64(state[ 8]), LOAD64(input[ 8])); \
+    X##gu = XOR64(LOAD64(state[ 9]), LOAD64(input[ 9])); \
+    X##ka = XOR64(LOAD64(state[10]), LOAD64(input[10])); \
+    X##ke = XOR64(LOAD64(state[11]), LOAD64(input[11])); \
+    X##ki = XOR64(LOAD64(state[12]), LOAD64(input[12])); \
+    X##ko = XOR64(LOAD64(state[13]), LOAD64(input[13])); \
+    X##ku = XOR64(LOAD64(state[14]), LOAD64(input[14])); \
+    X##ma = XOR64(LOAD64(state[15]), LOAD64(input[15])); \
+    X##me = XOR64(LOAD64(state[16]), LOAD64(input[16])); \
+    X##mi = XOR64(LOAD64(state[17]), LOAD64(input[17])); \
+    X##mo = XOR64(LOAD64(state[18]), LOAD64(input[18])); \
+    X##mu = XOR64(LOAD64(state[19]), LOAD64(input[19])); \
+    X##sa = XOR64(LOAD64(state[20]), LOAD64(input[20])); /* lane 20: last of the 21 lanes (21 x 64 = 1344 bits) that take input */ \
+    X##se = LOAD64(state[21]); /* lanes 21..24: copied from state only, input is not read */ \
+    X##si = LOAD64(state[22]); \
+    X##so = LOAD64(state[23]); \
+    X##su = LOAD64(state[24]); \
+
+#define copyFromState(X, state) /* X##ba..X##su = state[0..24]; plain load, no input is XORed in */ \
+    X##ba = LOAD64(state[ 0]); \
+    X##be = LOAD64(state[ 1]); \
+    X##bi = LOAD64(state[ 2]); \
+    X##bo = LOAD64(state[ 3]); \
+    X##bu = LOAD64(state[ 4]); \
+    X##ga = LOAD64(state[ 5]); /* the name prefix changes every five lanes: b, g, k, m, s */ \
+    X##ge = LOAD64(state[ 6]); \
+    X##gi = LOAD64(state[ 7]); \
+    X##go = LOAD64(state[ 8]); \
+    X##gu = LOAD64(state[ 9]); \
+    X##ka = LOAD64(state[10]); \
+    X##ke = LOAD64(state[11]); \
+    X##ki = LOAD64(state[12]); \
+    X##ko = LOAD64(state[13]); \
+    X##ku = LOAD64(state[14]); \
+    X##ma = LOAD64(state[15]); \
+    X##me = LOAD64(state[16]); \
+    X##mi = LOAD64(state[17]); \
+    X##mo = LOAD64(state[18]); \
+    X##mu = LOAD64(state[19]); \
+    X##sa = LOAD64(state[20]); \
+    X##se = LOAD64(state[21]); \
+    X##si = LOAD64(state[22]); \
+    X##so = LOAD64(state[23]); \
+    X##su = LOAD64(state[24]); \
+
+#define copyToState(state, X) /* state[0..24] = X##ba..X##su; inverse of copyFromState, same lane order */ \
+    STORE64(state[ 0], X##ba); \
+    STORE64(state[ 1], X##be); \
+    STORE64(state[ 2], X##bi); \
+    STORE64(state[ 3], X##bo); \
+    STORE64(state[ 4], X##bu); \
+    STORE64(state[ 5], X##ga); \
+    STORE64(state[ 6], X##ge); \
+    STORE64(state[ 7], X##gi); \
+    STORE64(state[ 8], X##go); \
+    STORE64(state[ 9], X##gu); \
+    STORE64(state[10], X##ka); \
+    STORE64(state[11], X##ke); \
+    STORE64(state[12], X##ki); \
+    STORE64(state[13], X##ko); \
+    STORE64(state[14], X##ku); \
+    STORE64(state[15], X##ma); \
+    STORE64(state[16], X##me); \
+    STORE64(state[17], X##mi); \
+    STORE64(state[18], X##mo); \
+    STORE64(state[19], X##mu); \
+    STORE64(state[20], X##sa); \
+    STORE64(state[21], X##se); \
+    STORE64(state[22], X##si); \
+    STORE64(state[23], X##so); \
+    STORE64(state[24], X##su); \
+
+#define copyStateVariables(X, Y) /* assigns each of the 25 Y## lane variables to the X## variable of the same suffix */ \
+    X##ba = Y##ba; \
+    X##be = Y##be; \
+    X##bi = Y##bi; \
+    X##bo = Y##bo; \
+    X##bu = Y##bu; \
+    X##ga = Y##ga; \
+    X##ge = Y##ge; \
+    X##gi = Y##gi; \
+    X##go = Y##go; \
+    X##gu = Y##gu; \
+    X##ka = Y##ka; \
+    X##ke = Y##ke; \
+    X##ki = Y##ki; \
+    X##ko = Y##ko; \
+    X##ku = Y##ku; \
+    X##ma = Y##ma; \
+    X##me = Y##me; \
+    X##mi = Y##mi; \
+    X##mo = Y##mo; \
+    X##mu = Y##mu; \
+    X##sa = Y##sa; \
+    X##se = Y##se; \
+    X##si = Y##si; \
+    X##so = Y##so; \
+    X##su = Y##su; \
+
diff --git a/Modules/_sha3/keccak/KeccakF-1600-unrolling.macros b/Modules/_sha3/keccak/KeccakF-1600-unrolling.macros
new file mode 100644
--- /dev/null
+++ b/Modules/_sha3/keccak/KeccakF-1600-unrolling.macros
@@ -0,0 +1,124 @@
+/*
+The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+Michaël Peeters and Gilles Van Assche. For more information, feedback or
+questions, please refer to our website: http://keccak.noekeon.org/
+
+Implementation by the designers,
+hereby denoted as "the implementer".
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#if (Unrolling == 24) /* all 24 rounds written out, no loop */
+#define rounds /* runs rounds 0..23 on the A/E variable sets, then stores A into state */ \
+    prepareTheta \
+    thetaRhoPiChiIotaPrepareTheta( 0, A, E) /* even rounds read A and write E */ \
+    thetaRhoPiChiIotaPrepareTheta( 1, E, A) /* odd rounds read E and write A */ \
+    thetaRhoPiChiIotaPrepareTheta( 2, A, E) \
+    thetaRhoPiChiIotaPrepareTheta( 3, E, A) \
+    thetaRhoPiChiIotaPrepareTheta( 4, A, E) \
+    thetaRhoPiChiIotaPrepareTheta( 5, E, A) \
+    thetaRhoPiChiIotaPrepareTheta( 6, A, E) \
+    thetaRhoPiChiIotaPrepareTheta( 7, E, A) \
+    thetaRhoPiChiIotaPrepareTheta( 8, A, E) \
+    thetaRhoPiChiIotaPrepareTheta( 9, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(10, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(11, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(12, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(13, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(14, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(15, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(16, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(17, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(18, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(19, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(20, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(21, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(22, A, E) \
+    thetaRhoPiChiIota(23, E, A) /* only the last round uses the variant without the PrepareTheta suffix */ \
+    copyToState(state, A)
+#elif (Unrolling == 12) /* looped variants: the loop counter i must be declared where rounds is expanded */
+#define rounds \
+    prepareTheta \
+    for(i=0; i<24; i+=12) { \
+        thetaRhoPiChiIotaPrepareTheta(i   , A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+ 1, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+ 2, A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+ 3, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+ 4, A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+ 5, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+ 6, A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+ 7, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+ 8, A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+ 9, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+10, A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+11, E, A) \
+    } \
+    copyToState(state, A)
+#elif (Unrolling == 8)
+#define rounds \
+    prepareTheta \
+    for(i=0; i<24; i+=8) { \
+        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+6, A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+7, E, A) \
+    } \
+    copyToState(state, A)
+#elif (Unrolling == 6)
+#define rounds \
+    prepareTheta \
+    for(i=0; i<24; i+=6) { \
+        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \
+    } \
+    copyToState(state, A)
+#elif (Unrolling == 4)
+#define rounds \
+    prepareTheta \
+    for(i=0; i<24; i+=4) { \
+        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
+    } \
+    copyToState(state, A)
+#elif (Unrolling == 3)
+#define rounds \
+    prepareTheta \
+    for(i=0; i<24; i+=3) { \
+        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
+        copyStateVariables(A, E) /* three rounds per pass leave the result in E: copy it back so the next pass starts from A */ \
+    } \
+    copyToState(state, A)
+#elif (Unrolling == 2)
+#define rounds \
+    prepareTheta \
+    for(i=0; i<24; i+=2) { \
+        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+    } \
+    copyToState(state, A)
+#elif (Unrolling == 1)
+#define rounds \
+    prepareTheta \
+    for(i=0; i<24; i++) { \
+        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
+        copyStateVariables(A, E) /* one round per pass leaves the result in E: copy it back into A */ \
+    } \
+    copyToState(state, A)
+#else
+#error "Unrolling is not correctly specified!"
+#endif
diff --git a/Modules/_sha3/keccak/KeccakF-1600-xop.macros b/Modules/_sha3/keccak/KeccakF-1600-xop.macros
new file mode 100644
--- /dev/null
+++ b/Modules/_sha3/keccak/KeccakF-1600-xop.macros
@@ -0,0 +1,573 @@
+/*
+The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+Michaël Peeters and Gilles Van Assche. For more information, feedback or
+questions, please refer to our website: http://keccak.noekeon.org/
+
+Implementation by the designers,
+hereby denoted as "the implementer".
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#define declareABCDE /* declares the A, B, C, D, E working variables and Zero; none of them is initialized here */ \
+    V128 Abage, Abegi, Abigo, Abogu, Abuga; \
+    V128 Akame, Akemi, Akimo, Akomu, Akuma; \
+    V128 Abae, Abio, Agae, Agio, Akae, Akio, Amae, Amio; /* temporaries written by the copyFromState* macros; declared for the A prefix only */ \
+    V64 Aba, Abe, Abi, Abo, Abu; \
+    V64 Aga, Age, Agi, Ago, Agu; \
+    V64 Aka, Ake, Aki, Ako, Aku; \
+    V64 Ama, Ame, Ami, Amo, Amu; \
+    V128 Asase, Asiso; \
+    V64 Asu; \
+    V128 Bbage, Bbegi, Bbigo, Bbogu, Bbuga; \
+    V128 Bkame, Bkemi, Bkimo, Bkomu, Bkuma; \
+    V128 Bsase, Bsesi, Bsiso, Bsosu, Bsusa; \
+    V128 Cae, Cei, Cio, Cou, Cua; \
+    V128 Dau, Dea, Die, Doi, Duo; \
+    V128 Dua, Dae, Dei, Dio, Dou; \
+    V128 Ebage, Ebegi, Ebigo, Ebogu, Ebuga; \
+    V128 Ekame, Ekemi, Ekimo, Ekomu, Ekuma; \
+    V128 Esase, Esiso; \
+    V64 Esu; \
+    V128 Zero;
+
+#define prepareTheta /* expands to nothing: the copyFromState* macros of this file already fill Cae, Cio and Cua */
+
+#define computeD /* derives the D words used by the round macro from the C accumulators Cae, Cio and Cua */ \
+    Cua = GET64LOLO(Cua, Cae); /* Cua itself is overwritten here, before it is used below */ \
+    Dei = XOR128(Cae, ROL6464same(Cio, 1)); \
+    Dou = XOR128(Cio, ROL6464same(Cua, 1)); \
+    Cei = GET64HILO(Cae, Cio); /* Cei is rebuilt from Cae and Cio; its previous value is not read */ \
+    Dae = XOR128(Cua, ROL6464same(Cei, 1)); \
+    Dau = GET64LOHI(Dae, Dou); /* the remaining words only rearrange halves of Dae, Dei and Dou */ \
+    Dea = SWAP64(Dae); \
+    Die = SWAP64(Dei); \
+    Doi = GET64LOLO(Dou, Die); \
+    Duo = SWAP64(Dou);
+
+/*  --- Theta Rho Pi Chi Iota Prepare-theta */
+/*  --- 64-bit lanes mapped to 64-bit and 128-bit words */
+#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \
+    computeD \
+    \
+    Bbage = XOR128(GET64LOHI(A##bage, A##bogu), Dau); \
+    Bbage = ROL6464(Bbage, 0, 20); \
+    Bbegi = XOR128(GET64HILO(A##bage, A##kame), Dea); \
+    Bbegi = ROL6464(Bbegi, 44, 3); \
+    Bbigo = XOR128(GET64LOHI(A##kimo, A##kame), Die); \
+    Bbigo = ROL6464(Bbigo, 43, 45); \
+    E##bage = XOR128(Bbage, ANDnu128(Bbegi, Bbigo)); \
+    XOReq128(E##bage, CONST64(KeccakF1600RoundConstants[i])); \
+    Cae = E##bage; \
+    Bbogu = XOR128(GET64HILO(A##kimo, A##siso), Doi); \
+    Bbogu = ROL6464(Bbogu, 21, 61); \
+    E##begi = XOR128(Bbegi, ANDnu128(Bbigo, Bbogu)); \
+    Cei = E##begi; \
+    Bbuga = XOR128(GET64LOLO(A##su, A##bogu), Duo); \
+    Bbuga = ROL6464(Bbuga, 14, 28); \
+    E##bigo = XOR128(Bbigo, ANDnu128(Bbogu, Bbuga)); \
+    Cio = E##bigo; \
+    E##bogu = XOR128(Bbogu, ANDnu128(Bbuga, Bbage)); \
+    Cou = E##bogu; \
+    E##buga = XOR128(Bbuga, ANDnu128(Bbage, Bbegi)); \
+    Cua = E##buga; \
+\
+    Bkame = XOR128(GET64LOHI(A##begi, A##buga), Dea); \
+    Bkame = ROL6464(Bkame, 1, 36); \
+    Bkemi = XOR128(GET64HILO(A##begi, A##kemi), Die); \
+    Bkemi = ROL6464(Bkemi, 6, 10); \
+    Bkimo = XOR128(GET64LOHI(A##komu, A##kemi), Doi); \
+    Bkimo = ROL6464(Bkimo, 25, 15); \
+    E##kame = XOR128(Bkame, ANDnu128(Bkemi, Bkimo)); \
+    XOReq128(Cae, E##kame); \
+    Bkomu = XOR128(GET64HIHI(A##komu, A##siso), Duo); \
+    Bkomu = ROL6464(Bkomu, 8, 56); \
+    E##kemi = XOR128(Bkemi, ANDnu128(Bkimo, Bkomu)); \
+    XOReq128(Cei, E##kemi); \
+    Bkuma = XOR128(GET64LOLO(A##sase, A##buga), Dau); \
+    Bkuma = ROL6464(Bkuma, 18, 27); \
+    E##kimo = XOR128(Bkimo, ANDnu128(Bkomu, Bkuma)); \
+    XOReq128(Cio, E##kimo); \
+    E##komu = XOR128(Bkomu, ANDnu128(Bkuma, Bkame)); \
+    XOReq128(Cou, E##komu); \
+    E##kuma = XOR128(Bkuma, ANDnu128(Bkame, Bkemi)); \
+    XOReq128(Cua, E##kuma); \
+\
+    Bsase = XOR128(A##bigo, SWAP64(Doi)); \
+    Bsase = ROL6464(Bsase, 62, 55); \
+    Bsiso = XOR128(A##kuma, SWAP64(Dau)); \
+    Bsiso = ROL6464(Bsiso, 39, 41); \
+    Bsusa = XOR64(COPY64HI2LO(A##sase), Dei); \
+    Bsusa = ROL6464same(Bsusa, 2); \
+    Bsusa = GET64LOLO(Bsusa, Bsase); \
+    Bsesi = GET64HILO(Bsase, Bsiso); \
+    Bsosu = GET64HILO(Bsiso, Bsusa); \
+    E##sase = XOR128(Bsase, ANDnu128(Bsesi, Bsiso)); \
+    XOReq128(Cae, E##sase); \
+    E##siso = XOR128(Bsiso, ANDnu128(Bsosu, Bsusa)); \
+    XOReq128(Cio, E##siso); \
+    E##su = GET64LOLO(XOR128(Bsusa, ANDnu128(Bsase, Bsesi)), Zero); \
+    XOReq128(Cua, E##su); \
+\
+    Zero = ZERO128(); \
+    XOReq128(Cae, GET64HIHI(Cua, Zero)); \
+    XOReq128(Cae, GET64LOLO(Zero, Cei)); \
+    XOReq128(Cio, GET64HIHI(Cei, Zero)); \
+    XOReq128(Cio, GET64LOLO(Zero, Cou)); \
+    XOReq128(Cua, GET64HIHI(Cou, Zero)); \
+
+/*  --- Theta Rho Pi Chi Iota (in this file simply an alias of the Prepare-theta variant) */
+/*  --- 64-bit lanes mapped to 64-bit and 128-bit words */
+#define thetaRhoPiChiIota(i, A, E) thetaRhoPiChiIotaPrepareTheta(i, A, E)
+
+static const UINT64 KeccakF1600RoundConstants[24] = { /* one constant per round; entry i is XORed into E##bage by thetaRhoPiChiIotaPrepareTheta(i, A, E) */
+    0x0000000000000001ULL,
+    0x0000000000008082ULL,
+    0x800000000000808aULL,
+    0x8000000080008000ULL,
+    0x000000000000808bULL,
+    0x0000000080000001ULL,
+    0x8000000080008081ULL,
+    0x8000000000008009ULL,
+    0x000000000000008aULL,
+    0x0000000000000088ULL,
+    0x0000000080008009ULL,
+    0x000000008000000aULL,
+    0x000000008000808bULL,
+    0x800000000000008bULL,
+    0x8000000000008089ULL,
+    0x8000000000008003ULL,
+    0x8000000000008002ULL,
+    0x8000000000000080ULL,
+    0x000000000000800aULL,
+    0x800000008000000aULL,
+    0x8000000080008081ULL,
+    0x8000000000008080ULL,
+    0x0000000080000001ULL,
+    0x8000000080008008ULL };
+
+#define copyFromStateAndXor576bits(X, state, input) /* loads state into the packed X## words with input lanes 0..8 XORed in, and fills Cae, Cio, Cua */ \
+    X##bae = XOR128(LOAD128(state[ 0]), LOAD128u(input[ 0])); \
+    X##ba = X##bae; \
+    X##be = GET64HIHI(X##bae, X##bae); \
+    Cae = X##bae; /* each C accumulator starts from the first word and has the later words XORed in */ \
+    X##bio = XOR128(LOAD128(state[ 2]), LOAD128u(input[ 2])); \
+    X##bi = X##bio; \
+    X##bo = GET64HIHI(X##bio, X##bio); \
+    Cio = X##bio; \
+    X##bu = XOR64(LOAD64(state[ 4]), LOAD64(input[ 4])); \
+    Cua = X##bu; \
+    X##gae = XOR128(LOAD128u(state[ 5]), LOAD128u(input[ 5])); \
+    X##ga = X##gae; \
+    X##buga = GET64LOLO(X##bu, X##ga); /* the packed words (buga, bage, ...) are what the round macros read */ \
+    X##ge = GET64HIHI(X##gae, X##gae); \
+    X##bage = GET64LOLO(X##ba, X##ge); \
+    XOReq128(Cae, X##gae); \
+    X##gio = XOR128(LOAD128u(state[ 7]), LOAD128u(input[ 7])); /* lanes 7 and 8: last input lanes (9 x 64 = 576 bits) */ \
+    X##gi = X##gio; \
+    X##begi = GET64LOLO(X##be, X##gi); \
+    X##go = GET64HIHI(X##gio, X##gio); \
+    X##bigo = GET64LOLO(X##bi, X##go); \
+    XOReq128(Cio, X##gio); \
+    X##gu = LOAD64(state[ 9]); /* from lane 9 on only state is loaded */ \
+    X##bogu = GET64LOLO(X##bo, X##gu); \
+    XOReq64(Cua, X##gu); \
+    X##kae = LOAD128(state[10]); \
+    X##ka = X##kae; \
+    X##ke = GET64HIHI(X##kae, X##kae); \
+    XOReq128(Cae, X##kae); \
+    X##kio = LOAD128(state[12]); \
+    X##ki = X##kio; \
+    X##ko = GET64HIHI(X##kio, X##kio); \
+    XOReq128(Cio, X##kio); \
+    X##kuma = LOAD128(state[14]); \
+    XOReq64(Cua, X##kuma); \
+    X##me = LOAD64(state[16]); \
+    X##kame = GET64LOLO(X##ka, X##me); \
+    XOReq128(Cae, GET64HIHI(X##kuma, X##kame)); \
+    X##mio = LOAD128u(state[17]); \
+    X##mi = X##mio; \
+    X##kemi = GET64LOLO(X##ke, X##mi); \
+    X##mo = GET64HIHI(X##mio, X##mio); \
+    X##kimo = GET64LOLO(X##ki, X##mo); \
+    XOReq128(Cio, X##mio); \
+    X##mu = LOAD64(state[19]); \
+    X##komu = GET64LOLO(X##ko, X##mu); \
+    XOReq64(Cua, X##mu); \
+    X##sase = LOAD128(state[20]); \
+    XOReq128(Cae, X##sase); \
+    X##siso = LOAD128(state[22]); \
+    XOReq128(Cio, X##siso); \
+    X##su = LOAD64(state[24]); \
+    XOReq64(Cua, X##su); \
+
+#define copyFromStateAndXor832bits(X, state, input) /* loads state into the packed X## words with input lanes 0..12 XORed in, and fills Cae, Cio, Cua */ \
+    X##bae = XOR128(LOAD128(state[ 0]), LOAD128u(input[ 0])); \
+    X##ba = X##bae; \
+    X##be = GET64HIHI(X##bae, X##bae); \
+    Cae = X##bae; /* each C accumulator starts from the first word and has the later words XORed in */ \
+    X##bio = XOR128(LOAD128(state[ 2]), LOAD128u(input[ 2])); \
+    X##bi = X##bio; \
+    X##bo = GET64HIHI(X##bio, X##bio); \
+    Cio = X##bio; \
+    X##bu = XOR64(LOAD64(state[ 4]), LOAD64(input[ 4])); \
+    Cua = X##bu; \
+    X##gae = XOR128(LOAD128u(state[ 5]), LOAD128u(input[ 5])); \
+    X##ga = X##gae; \
+    X##buga = GET64LOLO(X##bu, X##ga); /* the packed words (buga, bage, ...) are what the round macros read */ \
+    X##ge = GET64HIHI(X##gae, X##gae); \
+    X##bage = GET64LOLO(X##ba, X##ge); \
+    XOReq128(Cae, X##gae); \
+    X##gio = XOR128(LOAD128u(state[ 7]), LOAD128u(input[ 7])); \
+    X##gi = X##gio; \
+    X##begi = GET64LOLO(X##be, X##gi); \
+    X##go = GET64HIHI(X##gio, X##gio); \
+    X##bigo = GET64LOLO(X##bi, X##go); \
+    XOReq128(Cio, X##gio); \
+    X##gu = XOR64(LOAD64(state[ 9]), LOAD64(input[ 9])); \
+    X##bogu = GET64LOLO(X##bo, X##gu); \
+    XOReq64(Cua, X##gu); \
+    X##kae = XOR128(LOAD128(state[10]), LOAD128u(input[10])); \
+    X##ka = X##kae; \
+    X##ke = GET64HIHI(X##kae, X##kae); \
+    XOReq128(Cae, X##kae); \
+    X##kio = XOR128(LOAD128(state[12]), LOAD64(input[12])); /* lane 12 is the last input lane (13 x 64 = 832 bits): only 64 bits of input are loaded */ \
+    X##ki = X##kio; \
+    X##ko = GET64HIHI(X##kio, X##kio); \
+    XOReq128(Cio, X##kio); \
+    X##kuma = LOAD128(state[14]); /* from here on only state is loaded */ \
+    XOReq64(Cua, X##kuma); \
+    X##me = LOAD64(state[16]); \
+    X##kame = GET64LOLO(X##ka, X##me); \
+    XOReq128(Cae, GET64HIHI(X##kuma, X##kame)); \
+    X##mio = LOAD128u(state[17]); \
+    X##mi = X##mio; \
+    X##kemi = GET64LOLO(X##ke, X##mi); \
+    X##mo = GET64HIHI(X##mio, X##mio); \
+    X##kimo = GET64LOLO(X##ki, X##mo); \
+    XOReq128(Cio, X##mio); \
+    X##mu = LOAD64(state[19]); \
+    X##komu = GET64LOLO(X##ko, X##mu); \
+    XOReq64(Cua, X##mu); \
+    X##sase = LOAD128(state[20]); \
+    XOReq128(Cae, X##sase); \
+    X##siso = LOAD128(state[22]); \
+    XOReq128(Cio, X##siso); \
+    X##su = LOAD64(state[24]); \
+    XOReq64(Cua, X##su); \
+
+#define copyFromStateAndXor1024bits(X, state, input) \
+    X##bae = XOR128(LOAD128(state[ 0]), LOAD128u(input[ 0])); \
+    X##ba = X##bae; \
+    X##be = GET64HIHI(X##bae, X##bae); \
+    Cae = X##bae; \
+    X##bio = XOR128(LOAD128(state[ 2]), LOAD128u(input[ 2])); \
+    X##bi = X##bio; \
+    X##bo = GET64HIHI(X##bio, X##bio); \
+    Cio = X##bio; \
+    X##bu = XOR64(LOAD64(state[ 4]), LOAD64(input[ 4])); \
+    Cua = X##bu; \
+    X##gae = XOR128(LOAD128u(state[ 5]), LOAD128u(input[ 5])); \
+    X##ga = X##gae; \
+    X##buga = GET64LOLO(X##bu, X##ga); \
+    X##ge = GET64HIHI(X##gae, X##gae); \
+    X##bage = GET64LOLO(X##ba, X##ge); \
+    XOReq128(Cae, X##gae); \
+    X##gio = XOR128(LOAD128u(state[ 7]), LOAD128u(input[ 7])); \
+    X##gi = X##gio; \
+    X##begi = GET64LOLO(X##be, X##gi); \
+    X##go = GET64HIHI(X##gio, X##gio); \
+    X##bigo = GET64LOLO(X##bi, X##go); \
+    XOReq128(Cio, X##gio); \
+    X##gu = XOR64(LOAD64(state[ 9]), LOAD64(input[ 9])); \
+    X##bogu = GET64LOLO(X##bo, X##gu); \
+    XOReq64(Cua, X##gu); \
+    X##kae = XOR128(LOAD128(state[10]), LOAD128u(input[10])); \
+    X##ka = X##kae; \
+    X##ke = GET64HIHI(X##kae, X##kae); \
+    XOReq128(Cae, X##kae); \
+    X##kio = XOR128(LOAD128(state[12]), LOAD128u(input[12])); \
+    X##ki = X##kio; \
+    X##ko = GET64HIHI(X##kio, X##kio); \
+    XOReq128(Cio, X##kio); \
+    X##kuma = XOR128(LOAD128(state[14]), LOAD128(input[14])); \
+    XOReq64(Cua, X##kuma); \
+    X##me = LOAD64(state[16]); \
+    X##kame = GET64LOLO(X##ka, X##me); \
+    XOReq128(Cae, GET64HIHI(X##kuma, X##kame)); \
+    X##mio = LOAD128u(state[17]); \
+    X##mi = X##mio; \
+    X##kemi = GET64LOLO(X##ke, X##mi); \
+    X##mo = GET64HIHI(X##mio, X##mio); \
+    X##kimo = GET64LOLO(X##ki, X##mo); \
+    XOReq128(Cio, X##mio); \
+    X##mu = LOAD64(state[19]); \
+    X##komu = GET64LOLO(X##ko, X##mu); \
+    XOReq64(Cua, X##mu); \
+    X##sase = LOAD128(state[20]); \
+    XOReq128(Cae, X##sase); \
+    X##siso = LOAD128(state[22]); \
+    XOReq128(Cio, X##siso); \
+    X##su = LOAD64(state[24]); \
+    XOReq64(Cua, X##su); \
+
+#define copyFromStateAndXor1088bits(X, state, input) \
+    X##bae = XOR128(LOAD128(state[ 0]), LOAD128u(input[ 0])); \
+    X##ba = X##bae; \
+    X##be = GET64HIHI(X##bae, X##bae); \
+    Cae = X##bae; \
+    X##bio = XOR128(LOAD128(state[ 2]), LOAD128u(input[ 2])); \
+    X##bi = X##bio; \
+    X##bo = GET64HIHI(X##bio, X##bio); \
+    Cio = X##bio; \
+    X##bu = XOR64(LOAD64(state[ 4]), LOAD64(input[ 4])); \
+    Cua = X##bu; \
+    X##gae = XOR128(LOAD128u(state[ 5]), LOAD128u(input[ 5])); \
+    X##ga = X##gae; \
+    X##buga = GET64LOLO(X##bu, X##ga); \
+    X##ge = GET64HIHI(X##gae, X##gae); \
+    X##bage = GET64LOLO(X##ba, X##ge); \
+    XOReq128(Cae, X##gae); \
+    X##gio = XOR128(LOAD128u(state[ 7]), LOAD128u(input[ 7])); \
+    X##gi = X##gio; \
+    X##begi = GET64LOLO(X##be, X##gi); \
+    X##go = GET64HIHI(X##gio, X##gio); \
+    X##bigo = GET64LOLO(X##bi, X##go); \
+    XOReq128(Cio, X##gio); \
+    X##gu = XOR64(LOAD64(state[ 9]), LOAD64(input[ 9])); \
+    X##bogu = GET64LOLO(X##bo, X##gu); \
+    XOReq64(Cua, X##gu); \
+    X##kae = XOR128(LOAD128(state[10]), LOAD128u(input[10])); \
+    X##ka = X##kae; \
+    X##ke = GET64HIHI(X##kae, X##kae); \
+    XOReq128(Cae, X##kae); \
+    X##kio = XOR128(LOAD128(state[12]), LOAD128u(input[12])); \
+    X##ki = X##kio; \
+    X##ko = GET64HIHI(X##kio, X##kio); \
+    XOReq128(Cio, X##kio); \
+    X##kuma = XOR128(LOAD128(state[14]), LOAD128(input[14])); \
+    XOReq64(Cua, X##kuma); \
+    X##me = XOR64(LOAD64(state[16]), LOAD64(input[16])); \
+    X##kame = GET64LOLO(X##ka, X##me); \
+    XOReq128(Cae, GET64HIHI(X##kuma, X##kame)); \
+    X##mio = LOAD128u(state[17]); \
+    X##mi = X##mio; \
+    X##kemi = GET64LOLO(X##ke, X##mi); \
+    X##mo = GET64HIHI(X##mio, X##mio); \
+    X##kimo = GET64LOLO(X##ki, X##mo); \
+    XOReq128(Cio, X##mio); \
+    X##mu = LOAD64(state[19]); \
+    X##komu = GET64LOLO(X##ko, X##mu); \
+    XOReq64(Cua, X##mu); \
+    X##sase = LOAD128(state[20]); \
+    XOReq128(Cae, X##sase); \
+    X##siso = LOAD128(state[22]); \
+    XOReq128(Cio, X##siso); \
+    X##su = LOAD64(state[24]); \
+    XOReq64(Cua, X##su); \
+
+#define copyFromStateAndXor1152bits(X, state, input) \
+    X##bae = XOR128(LOAD128(state[ 0]), LOAD128u(input[ 0])); \
+    X##ba = X##bae; \
+    X##be = GET64HIHI(X##bae, X##bae); \
+    Cae = X##bae; \
+    X##bio = XOR128(LOAD128(state[ 2]), LOAD128u(input[ 2])); \
+    X##bi = X##bio; \
+    X##bo = GET64HIHI(X##bio, X##bio); \
+    Cio = X##bio; \
+    X##bu = XOR64(LOAD64(state[ 4]), LOAD64(input[ 4])); \
+    Cua = X##bu; \
+    X##gae = XOR128(LOAD128u(state[ 5]), LOAD128u(input[ 5])); \
+    X##ga = X##gae; \
+    X##buga = GET64LOLO(X##bu, X##ga); \
+    X##ge = GET64HIHI(X##gae, X##gae); \
+    X##bage = GET64LOLO(X##ba, X##ge); \
+    XOReq128(Cae, X##gae); \
+    X##gio = XOR128(LOAD128u(state[ 7]), LOAD128u(input[ 7])); \
+    X##gi = X##gio; \
+    X##begi = GET64LOLO(X##be, X##gi); \
+    X##go = GET64HIHI(X##gio, X##gio); \
+    X##bigo = GET64LOLO(X##bi, X##go); \
+    XOReq128(Cio, X##gio); \
+    X##gu = XOR64(LOAD64(state[ 9]), LOAD64(input[ 9])); \
+    X##bogu = GET64LOLO(X##bo, X##gu); \
+    XOReq64(Cua, X##gu); \
+    X##kae = XOR128(LOAD128(state[10]), LOAD128u(input[10])); \
+    X##ka = X##kae; \
+    X##ke = GET64HIHI(X##kae, X##kae); \
+    XOReq128(Cae, X##kae); \
+    X##kio = XOR128(LOAD128(state[12]), LOAD128u(input[12])); \
+    X##ki = X##kio; \
+    X##ko = GET64HIHI(X##kio, X##kio); \
+    XOReq128(Cio, X##kio); \
+    X##kuma = XOR128(LOAD128(state[14]), LOAD128(input[14])); \
+    XOReq64(Cua, X##kuma); \
+    X##me = XOR64(LOAD64(state[16]), LOAD64(input[16])); \
+    X##kame = GET64LOLO(X##ka, X##me); \
+    XOReq128(Cae, GET64HIHI(X##kuma, X##kame)); \
+    X##mio = XOR128(LOAD128u(state[17]), LOAD64(input[17])); \
+    X##mi = X##mio; \
+    X##kemi = GET64LOLO(X##ke, X##mi); \
+    X##mo = GET64HIHI(X##mio, X##mio); \
+    X##kimo = GET64LOLO(X##ki, X##mo); \
+    XOReq128(Cio, X##mio); \
+    X##mu = LOAD64(state[19]); \
+    X##komu = GET64LOLO(X##ko, X##mu); \
+    XOReq64(Cua, X##mu); \
+    X##sase = LOAD128(state[20]); \
+    XOReq128(Cae, X##sase); \
+    X##siso = LOAD128(state[22]); \
+    XOReq128(Cio, X##siso); \
+    X##su = LOAD64(state[24]); \
+    XOReq64(Cua, X##su); \
+
+#define copyFromStateAndXor1344bits(X, state, input) \
+    X##bae = XOR128(LOAD128(state[ 0]), LOAD128u(input[ 0])); \
+    X##ba = X##bae; \
+    X##be = GET64HIHI(X##bae, X##bae); \
+    Cae = X##bae; \
+    X##bio = XOR128(LOAD128(state[ 2]), LOAD128u(input[ 2])); \
+    X##bi = X##bio; \
+    X##bo = GET64HIHI(X##bio, X##bio); \
+    Cio = X##bio; \
+    X##bu = XOR64(LOAD64(state[ 4]), LOAD64(input[ 4])); \
+    Cua = X##bu; \
+    X##gae = XOR128(LOAD128u(state[ 5]), LOAD128u(input[ 5])); \
+    X##ga = X##gae; \
+    X##buga = GET64LOLO(X##bu, X##ga); \
+    X##ge = GET64HIHI(X##gae, X##gae); \
+    X##bage = GET64LOLO(X##ba, X##ge); \
+    XOReq128(Cae, X##gae); \
+    X##gio = XOR128(LOAD128u(state[ 7]), LOAD128u(input[ 7])); \
+    X##gi = X##gio; \
+    X##begi = GET64LOLO(X##be, X##gi); \
+    X##go = GET64HIHI(X##gio, X##gio); \
+    X##bigo = GET64LOLO(X##bi, X##go); \
+    XOReq128(Cio, X##gio); \
+    X##gu = XOR64(LOAD64(state[ 9]), LOAD64(input[ 9])); \
+    X##bogu = GET64LOLO(X##bo, X##gu); \
+    XOReq64(Cua, X##gu); \
+    X##kae = XOR128(LOAD128(state[10]), LOAD128u(input[10])); \
+    X##ka = X##kae; \
+    X##ke = GET64HIHI(X##kae, X##kae); \
+    XOReq128(Cae, X##kae); \
+    X##kio = XOR128(LOAD128(state[12]), LOAD128u(input[12])); \
+    X##ki = X##kio; \
+    X##ko = GET64HIHI(X##kio, X##kio); \
+    XOReq128(Cio, X##kio); \
+    X##kuma = XOR128(LOAD128(state[14]), LOAD128(input[14])); \
+    XOReq64(Cua, X##kuma); \
+    X##me = XOR64(LOAD64(state[16]), LOAD64(input[16])); \
+    X##kame = GET64LOLO(X##ka, X##me); \
+    XOReq128(Cae, GET64HIHI(X##kuma, X##kame)); \
+    X##mio = XOR128(LOAD128u(state[17]), LOAD128u(input[17])); \
+    X##mi = X##mio; \
+    X##kemi = GET64LOLO(X##ke, X##mi); \
+    X##mo = GET64HIHI(X##mio, X##mio); \
+    X##kimo = GET64LOLO(X##ki, X##mo); \
+    XOReq128(Cio, X##mio); \
+    X##mu = XOR64(LOAD64(state[19]), LOAD64(input[19])); \
+    X##komu = GET64LOLO(X##ko, X##mu); \
+    XOReq64(Cua, X##mu); \
+    X##sase = XOR128(LOAD128(state[20]), LOAD64(input[20])); \
+    XOReq128(Cae, X##sase); \
+    X##siso = LOAD128(state[22]); \
+    XOReq128(Cio, X##siso); \
+    X##su = LOAD64(state[24]); \
+    XOReq64(Cua, X##su); \
+
+#define copyFromState(X, state) /* loads state into the packed X## words (no input XORed in) and fills Cae, Cio, Cua */ \
+    X##bae = LOAD128(state[ 0]); \
+    X##ba = X##bae; \
+    X##be = GET64HIHI(X##bae, X##bae); \
+    Cae = X##bae; /* each C accumulator starts from the first word and has the later words XORed in */ \
+    X##bio = LOAD128(state[ 2]); \
+    X##bi = X##bio; \
+    X##bo = GET64HIHI(X##bio, X##bio); \
+    Cio = X##bio; \
+    X##bu = LOAD64(state[ 4]); \
+    Cua = X##bu; \
+    X##gae = LOAD128u(state[ 5]); /* odd lane indices (5, 7, 17) are read with LOAD128u, even ones with LOAD128 */ \
+    X##ga = X##gae; \
+    X##buga = GET64LOLO(X##bu, X##ga); /* the packed words (buga, bage, ...) are what the round macros read */ \
+    X##ge = GET64HIHI(X##gae, X##gae); \
+    X##bage = GET64LOLO(X##ba, X##ge); \
+    XOReq128(Cae, X##gae); \
+    X##gio = LOAD128u(state[ 7]); \
+    X##gi = X##gio; \
+    X##begi = GET64LOLO(X##be, X##gi); \
+    X##go = GET64HIHI(X##gio, X##gio); \
+    X##bigo = GET64LOLO(X##bi, X##go); \
+    XOReq128(Cio, X##gio); \
+    X##gu = LOAD64(state[ 9]); \
+    X##bogu = GET64LOLO(X##bo, X##gu); \
+    XOReq64(Cua, X##gu); \
+    X##kae = LOAD128(state[10]); \
+    X##ka = X##kae; \
+    X##ke = GET64HIHI(X##kae, X##kae); \
+    XOReq128(Cae, X##kae); \
+    X##kio = LOAD128(state[12]); \
+    X##ki = X##kio; \
+    X##ko = GET64HIHI(X##kio, X##kio); \
+    XOReq128(Cio, X##kio); \
+    X##kuma = LOAD128(state[14]); \
+    XOReq64(Cua, X##kuma); \
+    X##me = LOAD64(state[16]); \
+    X##kame = GET64LOLO(X##ka, X##me); \
+    XOReq128(Cae, GET64HIHI(X##kuma, X##kame)); \
+    X##mio = LOAD128u(state[17]); \
+    X##mi = X##mio; \
+    X##kemi = GET64LOLO(X##ke, X##mi); \
+    X##mo = GET64HIHI(X##mio, X##mio); \
+    X##kimo = GET64LOLO(X##ki, X##mo); \
+    XOReq128(Cio, X##mio); \
+    X##mu = LOAD64(state[19]); \
+    X##komu = GET64LOLO(X##ko, X##mu); \
+    XOReq64(Cua, X##mu); \
+    X##sase = LOAD128(state[20]); \
+    XOReq128(Cae, X##sase); \
+    X##siso = LOAD128(state[22]); \
+    XOReq128(Cio, X##siso); \
+    X##su = LOAD64(state[24]); \
+    XOReq64(Cua, X##su); \
+
+#define copyToState(state, X) /* writes the packed X## words back to state[0..24] */ \
+    STORE64(state[ 0], X##bage); /* 64-bit store of a packed word: one lane is written */ \
+    STORE64(state[ 1], X##begi); \
+    STORE64(state[ 2], X##bigo); \
+    STORE64(state[ 3], X##bogu); \
+    STORE128(state[ 4], X##buga); /* 128-bit store: covers state[4] and state[5], so the next store targets state[6] */ \
+    STORE64(state[ 6], COPY64HI2LO(X##bage)); /* the other half of each word used for state[0..3] goes to state[6..9] */ \
+    STORE64(state[ 7], COPY64HI2LO(X##begi)); \
+    STORE64(state[ 8], COPY64HI2LO(X##bigo)); \
+    STORE64(state[ 9], COPY64HI2LO(X##bogu)); \
+    STORE64(state[10], X##kame); \
+    STORE64(state[11], X##kemi); \
+    STORE64(state[12], X##kimo); \
+    STORE64(state[13], X##komu); \
+    STORE128(state[14], X##kuma); /* covers state[14] and state[15] */ \
+    STORE64(state[16], COPY64HI2LO(X##kame)); \
+    STORE64(state[17], COPY64HI2LO(X##kemi)); \
+    STORE64(state[18], COPY64HI2LO(X##kimo)); \
+    STORE64(state[19], COPY64HI2LO(X##komu)); \
+    STORE128(state[20], X##sase); \
+    STORE128(state[22], X##siso); \
+    STORE64(state[24], X##su); \
+
+#define copyStateVariables(X, Y) /* assigns each of the 13 packed Y## state words to the X## word of the same suffix */ \
+    X##bage = Y##bage; \
+    X##begi = Y##begi; \
+    X##bigo = Y##bigo; \
+    X##bogu = Y##bogu; \
+    X##buga = Y##buga; \
+    X##kame = Y##kame; \
+    X##kemi = Y##kemi; \
+    X##kimo = Y##kimo; \
+    X##komu = Y##komu; \
+    X##kuma = Y##kuma; \
+    X##sase = Y##sase; \
+    X##siso = Y##siso; \
+    X##su = Y##su; \
+
diff --git a/Modules/_sha3/keccak/KeccakNISTInterface.c b/Modules/_sha3/keccak/KeccakNISTInterface.c
new file mode 100644
--- /dev/null
+++ b/Modules/_sha3/keccak/KeccakNISTInterface.c
@@ -0,0 +1,83 @@
+/*
+The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+Michaël Peeters and Gilles Van Assche. For more information, feedback or
+questions, please refer to our website: http://keccak.noekeon.org/
+
+Implementation by the designers,
+hereby denoted as "the implementer".
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#include <string.h>
+#include "KeccakNISTInterface.h"
+#include "KeccakF-1600-interface.h"
+
+/* Initialize a hash state for one of the supported output lengths.
+ *
+ * state:      hash state to initialize.
+ * hashbitlen: 0 (default parameters, arbitrary length output), 224, 256,
+ *             384 or 512.
+ *
+ * Returns SUCCESS, BAD_HASHLEN for any other hashbitlen, or FAIL when
+ * InitSponge() rejects the selected rate/capacity pair. */
+static HashReturn Init(hashState *state, int hashbitlen)
+{
+    unsigned int rate, capacity;
+
+    switch(hashbitlen) {
+        case 0: /*  Default parameters, arbitrary length output */
+            rate = 1024; capacity = 576;
+            break;
+        case 224:
+            rate = 1152; capacity = 448;
+            break;
+        case 256:
+            rate = 1088; capacity = 512;
+            break;
+        case 384:
+            rate = 832; capacity = 768;
+            break;
+        case 512:
+            rate = 576; capacity = 1024;
+            break;
+        default:
+            return BAD_HASHLEN;
+    }
+    /* InitSponge() signals failure with a non-zero result; do not report
+     * success for a state it refused to set up. */
+    if (InitSponge((spongeState*)state, rate, capacity) != 0)
+        return FAIL;
+    state->fixedOutputLength = hashbitlen;
+    return SUCCESS;
+}
+
+/* Absorb databitlen bits of input.  Whole bytes are passed to Absorb()
+ * unchanged.  When the length is not a multiple of 8, the bits of the last,
+ * partial byte are shifted down to the least significant positions and
+ * absorbed in a second call. */
+static HashReturn Update(hashState *state, const BitSequence *data, DataLength databitlen)
+{
+    const unsigned int trailingBits = (unsigned int)(databitlen % 8);
+    HashReturn status;
+    unsigned char lastByte;
+
+    if (trailingBits == 0)
+        return Absorb((spongeState*)state, data, databitlen);
+
+    status = Absorb((spongeState*)state, data, databitlen - trailingBits);
+    if (status != SUCCESS)
+        return status;
+
+    /*  Align the last partial byte to the least significant bits */
+    lastByte = data[databitlen/8] >> (8 - trailingBits);
+    return Absorb((spongeState*)state, &lastByte, trailingBits);
+}
+
+/* Write the digest to hashval by squeezing state->fixedOutputLength bits
+ * out of the sponge. */
+static HashReturn Final(hashState *state, BitSequence *hashval)
+{
+    const unsigned int outputBits = state->fixedOutputLength;
+
+    return Squeeze(state, hashval, outputBits);
+}
+
+/*
+static HashReturn Hash(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval)
+{
+    hashState state;
+    HashReturn result;
+
+    if ((hashbitlen != 224) && (hashbitlen != 256) && (hashbitlen != 384) && (hashbitlen != 512))
+        return BAD_HASHLEN; *  Only the four fixed output lengths available through this API *
+    result = Init(&state, hashbitlen);
+    if (result != SUCCESS)
+        return result;
+    result = Update(&state, data, databitlen);
+    if (result != SUCCESS)
+        return result;
+    result = Final(&state, hashval);
+    return result;
+}
+*/
+
diff --git a/Modules/_sha3/keccak/KeccakNISTInterface.h b/Modules/_sha3/keccak/KeccakNISTInterface.h
new file mode 100644
--- /dev/null
+++ b/Modules/_sha3/keccak/KeccakNISTInterface.h
@@ -0,0 +1,72 @@
+/*
+The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+Michaël Peeters and Gilles Van Assche. For more information, feedback or
+questions, please refer to our website: http://keccak.noekeon.org/
+
+Implementation by the designers,
+hereby denoted as "the implementer".
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#ifndef _KeccakNISTInterface_h_
+#define _KeccakNISTInterface_h_
+
+#include "KeccakSponge.h"
+
+typedef unsigned char BitSequence;
+typedef unsigned long long DataLength;
+typedef enum { SUCCESS = 0, FAIL = 1, BAD_HASHLEN = 2 } HashReturn;
+
+typedef spongeState hashState;
+
+/**
+  * Function to initialize the state of the Keccak[r, c] sponge function.
+  * The rate r and capacity c values are determined from @a hashbitlen.
+  * @param  state       Pointer to the state of the sponge function to be initialized.
+  * @param  hashbitlen  The desired number of output bits, 
+  *                     or 0 for Keccak[] with default parameters
+  *                     and arbitrarily-long output.
+  * @pre    The value of hashbitlen must be one of 0, 224, 256, 384 and 512.
+  * @return SUCCESS if successful, BAD_HASHLEN if the value of hashbitlen is incorrect.
+  */
+static HashReturn Init(hashState *state, int hashbitlen);
+/**
+  * Function to give input data for the sponge function to absorb.
+  * @param  state       Pointer to the state of the sponge function initialized by Init().
+  * @param  data        Pointer to the input data. 
+  *                     When @a databitLen is not a multiple of 8, the last bits of data must be
+  *                     in the most significant bits of the last byte.
+  * @param  databitLen  The number of input bits provided in the input data.
+  * @pre    In the previous call to Update(), databitlen was a multiple of 8.
+  * @return SUCCESS if successful, FAIL otherwise.
+  */
+static HashReturn Update(hashState *state, const BitSequence *data, DataLength databitlen);
+/**
+  * Function to squeeze output data from the sponge function.
+  * If @a hashbitlen was not 0 in the call to Init(), the number of output bits is equal to @a hashbitlen.
+  * If @a hashbitlen was 0 in the call to Init(), the output bits must be extracted using the Squeeze() function.
+  * @param  state       Pointer to the state of the sponge function initialized by Init().
+  * @param  hashval     Pointer to the buffer where to store the output data.
+  * @return SUCCESS if successful, FAIL otherwise.
+  */
+static HashReturn Final(hashState *state, BitSequence *hashval);
+/**
+  * Function to compute a hash using the Keccak[r, c] sponge function.
+  * The rate r and capacity c values are determined from @a hashbitlen.
+  * @param  hashbitlen  The desired number of output bits.
+  * @param  data        Pointer to the input data. 
+  *                     When @a databitLen is not a multiple of 8, the last bits of data must be
+  *                     in the most significant bits of the last byte.
+  * @param  databitLen  The number of input bits provided in the input data.
+  * @param  hashval     Pointer to the buffer where to store the output data.
+  * @pre    The value of hashbitlen must be one of 224, 256, 384 and 512.
+  * @return SUCCESS if successful, BAD_HASHLEN if the value of hashbitlen is incorrect.
+  */
+/*
+static HashReturn Hash(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval);
+*/
+
+#endif
diff --git a/Modules/_sha3/keccak/KeccakSponge.c b/Modules/_sha3/keccak/KeccakSponge.c
new file mode 100644
--- /dev/null
+++ b/Modules/_sha3/keccak/KeccakSponge.c
@@ -0,0 +1,266 @@
+/*
+The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+Michaël Peeters and Gilles Van Assche. For more information, feedback or
+questions, please refer to our website: http://keccak.noekeon.org/
+
+Implementation by the designers,
+hereby denoted as "the implementer".
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#include <string.h>
+#include "KeccakSponge.h"
+#include "KeccakF-1600-interface.h"
+#ifdef KeccakReference
+#include "displayIntermediateValues.h"
+#endif
+
+/* Reset a sponge state to the start of the absorbing phase.
+ * Returns 1 without touching the state unless rate+capacity is 1600 and
+ * rate is a non-zero multiple of 64 below 1600; returns 0 otherwise. */
+static int InitSponge(spongeState *state, unsigned int rate, unsigned int capacity)
+{
+    const int sumIsValid = (rate+capacity == 1600);
+    const int rateIsValid = (rate > 0) && (rate < 1600) && ((rate % 64) == 0);
+
+    if (!sumIsValid || !rateIsValid)
+        return 1;
+
+    KeccakInitialize();
+
+    /* sponge parameters */
+    state->rate = rate;
+    state->capacity = capacity;
+    state->fixedOutputLength = 0;
+
+    /* permutation state and input queue start out cleared */
+    KeccakInitializeState(state->state);
+    memset(state->dataQueue, 0, KeccakMaximumRateInBytes);
+
+    /* nothing queued, nothing squeezed yet */
+    state->bitsInQueue = 0;
+    state->squeezing = 0;
+    state->bitsAvailableForSqueezing = 0;
+
+    return 0;
+}
+
+/* Absorb the block held in state->dataQueue into state->state and mark the
+ * queue as empty (bitsInQueue = 0).
+ *
+ * The ProvideFast<N> sections form one if/else-if chain: each section that
+ * is compiled in ends with a dangling `else` that binds to the next
+ * compiled statement, so the generic KeccakAbsorb() call at the bottom is
+ * the final fallback for any rate without a specialised routine. */
+static void AbsorbQueue(spongeState *state)
+{
+    /*  state->bitsInQueue is assumed to be equal to state->rate */
+    #ifdef KeccakReference
+    displayBytes(1, "Block to be absorbed", state->dataQueue, state->rate/8);
+    #endif
+#ifdef ProvideFast576
+    if (state->rate == 576)
+        KeccakAbsorb576bits(state->state, state->dataQueue);
+    else 
+#endif
+#ifdef ProvideFast832
+    if (state->rate == 832)
+        KeccakAbsorb832bits(state->state, state->dataQueue);
+    else 
+#endif
+#ifdef ProvideFast1024
+    if (state->rate == 1024)
+        KeccakAbsorb1024bits(state->state, state->dataQueue);
+    else 
+#endif
+#ifdef ProvideFast1088
+    if (state->rate == 1088)
+        KeccakAbsorb1088bits(state->state, state->dataQueue);
+    else
+#endif
+#ifdef ProvideFast1152
+    if (state->rate == 1152)
+        KeccakAbsorb1152bits(state->state, state->dataQueue);
+    else 
+#endif
+#ifdef ProvideFast1344
+    if (state->rate == 1344)
+        KeccakAbsorb1344bits(state->state, state->dataQueue);
+    else 
+#endif
+        /* generic path: the block length is passed as rate/64 */
+        KeccakAbsorb(state->state, state->dataQueue, state->rate/64);
+    state->bitsInQueue = 0;
+}
+
+/* Feed databitlen bits from data into the sponge.
+ *
+ * Returns 1 (and absorbs nothing) when the queue already holds a partial
+ * byte from an earlier call or when the sponge has switched to squeezing;
+ * returns 0 otherwise.
+ *
+ * Two paths are used inside the loop:
+ *  - fast path: when the queue is empty and at least one full block of
+ *    state->rate bits remains, whole blocks are absorbed straight from the
+ *    caller's buffer without being copied into the queue;
+ *  - queue path: otherwise whole bytes are copied into state->dataQueue
+ *    (never more than fits into one block), the queue is absorbed once it
+ *    is full, and a trailing partial byte is stored masked to its low bits.
+ */
+static int Absorb(spongeState *state, const unsigned char *data, unsigned long long databitlen)
+{
+    unsigned long long i, j, wholeBlocks;
+    unsigned int partialBlock, partialByte;
+    const unsigned char *curData;
+
+    if ((state->bitsInQueue % 8) != 0)
+        return 1; /*  Only the last call may contain a partial byte */
+    if (state->squeezing)
+        return 1; /*  Too late for additional input */
+
+    /* i counts the input bits consumed so far */
+    i = 0;
+    while(i < databitlen) {
+        /* The databitlen >= rate test keeps databitlen-rate from wrapping
+         * around in unsigned arithmetic. */
+        if ((state->bitsInQueue == 0) && (databitlen >= state->rate) && (i <= (databitlen-state->rate))) {
+            wholeBlocks = (databitlen-i)/state->rate;
+            curData = data+i/8;
+            /* Same dangling-else chain as in AbsorbQueue(): the braced
+             * generic loop at the end is the fallback. */
+#ifdef ProvideFast576
+            if (state->rate == 576) {
+                for(j=0; j<wholeBlocks; j++, curData+=576/8) {
+                    #ifdef KeccakReference
+                    displayBytes(1, "Block to be absorbed", curData, state->rate/8);
+                    #endif
+                    KeccakAbsorb576bits(state->state, curData);
+                }
+            }
+            else
+#endif
+#ifdef ProvideFast832
+            if (state->rate == 832) {
+                for(j=0; j<wholeBlocks; j++, curData+=832/8) {
+                    #ifdef KeccakReference
+                    displayBytes(1, "Block to be absorbed", curData, state->rate/8);
+                    #endif
+                    KeccakAbsorb832bits(state->state, curData);
+                }
+            }
+            else
+#endif
+#ifdef ProvideFast1024
+            if (state->rate == 1024) {
+                for(j=0; j<wholeBlocks; j++, curData+=1024/8) {
+                    #ifdef KeccakReference
+                    displayBytes(1, "Block to be absorbed", curData, state->rate/8);
+                    #endif
+                    KeccakAbsorb1024bits(state->state, curData);
+                }
+            }
+            else
+#endif
+#ifdef ProvideFast1088
+            if (state->rate == 1088) {
+                for(j=0; j<wholeBlocks; j++, curData+=1088/8) {
+                    #ifdef KeccakReference
+                    displayBytes(1, "Block to be absorbed", curData, state->rate/8);
+                    #endif
+                    KeccakAbsorb1088bits(state->state, curData);
+                }
+            }
+            else
+#endif
+#ifdef ProvideFast1152
+            if (state->rate == 1152) {
+                for(j=0; j<wholeBlocks; j++, curData+=1152/8) {
+                    #ifdef KeccakReference
+                    displayBytes(1, "Block to be absorbed", curData, state->rate/8);
+                    #endif
+                    KeccakAbsorb1152bits(state->state, curData);
+                }
+            }
+            else
+#endif
+#ifdef ProvideFast1344
+            if (state->rate == 1344) {
+                for(j=0; j<wholeBlocks; j++, curData+=1344/8) {
+                    #ifdef KeccakReference
+                    displayBytes(1, "Block to be absorbed", curData, state->rate/8);
+                    #endif
+                    KeccakAbsorb1344bits(state->state, curData);
+                }
+            }
+            else
+#endif
+            {
+                for(j=0; j<wholeBlocks; j++, curData+=state->rate/8) {
+                    #ifdef KeccakReference
+                    displayBytes(1, "Block to be absorbed", curData, state->rate/8);
+                    #endif
+                    KeccakAbsorb(state->state, curData, state->rate/64);
+                }
+            }
+            i += wholeBlocks*state->rate;
+        }
+        else {
+            /* Take what is left of the input, capped to the free space in
+             * the queue, and split it into whole bytes plus a bit remainder. */
+            partialBlock = (unsigned int)(databitlen - i);
+            if (partialBlock+state->bitsInQueue > state->rate)
+                partialBlock = state->rate-state->bitsInQueue;
+            partialByte = partialBlock % 8;
+            partialBlock -= partialByte;
+            memcpy(state->dataQueue+state->bitsInQueue/8, data+i/8, partialBlock/8);
+            state->bitsInQueue += partialBlock;
+            i += partialBlock;
+            if (state->bitsInQueue == state->rate)
+                AbsorbQueue(state);
+            if (partialByte > 0) {
+                /* keep only the partialByte least significant bits */
+                unsigned char mask = (1 << partialByte)-1;
+                state->dataQueue[state->bitsInQueue/8] = data[i/8] & mask;
+                state->bitsInQueue += partialByte;
+                i += partialByte;
+            }
+        }
+    }
+    return 0;
+}
+
+/* Pad the queued input, absorb it, and switch the sponge to squeezing.
+ *
+ * The padding sets the bit directly after the queued data and the last bit
+ * of the block (bit rate-1); everything in between is zero.  Afterwards the
+ * first rate bits of output are extracted into state->dataQueue,
+ * bitsAvailableForSqueezing is set accordingly and state->squeezing
+ * becomes 1. */
+static void PadAndSwitchToSqueezingPhase(spongeState *state)
+{
+    /*  Note: the bits are numbered from 0=LSB to 7=MSB */
+    if (state->bitsInQueue + 1 == state->rate) {
+        /* Only one bit is free: the first padding bit completes this block,
+         * and the final padding bit goes into a fresh, zeroed block. */
+        state->dataQueue[state->bitsInQueue/8 ] |= 1 << (state->bitsInQueue % 8);
+        AbsorbQueue(state);
+        memset(state->dataQueue, 0, state->rate/8);
+    }
+    else {
+        /* Zero the bytes after the queued data, then set the first padding
+         * bit right behind the last queued bit. */
+        memset(state->dataQueue + (state->bitsInQueue+7)/8, 0, state->rate/8 - (state->bitsInQueue+7)/8);
+        state->dataQueue[state->bitsInQueue/8 ] |= 1 << (state->bitsInQueue % 8);
+    }
+    /* final padding bit: last bit of the block */
+    state->dataQueue[(state->rate-1)/8] |= 1 << ((state->rate-1) % 8);
+    AbsorbQueue(state);
+
+    #ifdef KeccakReference
+    displayText(1, "--- Switching to squeezing phase ---");
+    #endif
+#ifdef ProvideFast1024
+    if (state->rate == 1024) {
+        KeccakExtract1024bits(state->state, state->dataQueue);
+        state->bitsAvailableForSqueezing = 1024;
+    }
+    else
+#endif
+    {
+        KeccakExtract(state->state, state->dataQueue, state->rate/64);
+        state->bitsAvailableForSqueezing = state->rate;
+    }
+    #ifdef KeccakReference
+    displayBytes(1, "Block available for squeezing", state->dataQueue, state->bitsAvailableForSqueezing/8);
+    #endif
+    state->squeezing = 1;
+}
+
+/* Copy outputLength bits of sponge output to output.
+ *
+ * If the sponge is still absorbing it is padded and switched to the
+ * squeezing phase first.  Returns 1 when outputLength is not a multiple of
+ * 8, otherwise 0.
+ * NOTE(review): the phase switch happens before outputLength is validated,
+ * so a rejected call still ends the absorbing phase. */
+static int Squeeze(spongeState *state, unsigned char *output, unsigned long long outputLength)
+{
+    unsigned long long i;
+    unsigned int partialBlock;
+
+    if (!state->squeezing)
+        PadAndSwitchToSqueezingPhase(state);
+    if ((outputLength % 8) != 0)
+        return 1; /*  Only multiple of 8 bits are allowed, truncation can be done at user level */
+
+    /* i counts the output bits produced so far */
+    i = 0;
+    while(i < outputLength) {
+        if (state->bitsAvailableForSqueezing == 0) {
+            /* The current block is used up: permute and extract the next one. */
+            KeccakPermutation(state->state);
+#ifdef ProvideFast1024
+            if (state->rate == 1024) {
+                KeccakExtract1024bits(state->state, state->dataQueue);
+                state->bitsAvailableForSqueezing = 1024;
+            }
+            else
+#endif
+            {
+                KeccakExtract(state->state, state->dataQueue, state->rate/64);
+                state->bitsAvailableForSqueezing = state->rate;
+            }
+            #ifdef KeccakReference
+            displayBytes(1, "Block available for squeezing", state->dataQueue, state->bitsAvailableForSqueezing/8);
+            #endif
+        }
+        /* Hand out as much of the current block as is still requested; the
+         * unread part starts at offset rate - bitsAvailableForSqueezing. */
+        partialBlock = state->bitsAvailableForSqueezing;
+        if ((unsigned long long)partialBlock > outputLength - i)
+            partialBlock = (unsigned int)(outputLength - i);
+        memcpy(output+i/8, state->dataQueue+(state->rate-state->bitsAvailableForSqueezing)/8, partialBlock/8);
+        state->bitsAvailableForSqueezing -= partialBlock;
+        i += partialBlock;
+    }
+    return 0;
+}
diff --git a/Modules/_sha3/keccak/KeccakSponge.h b/Modules/_sha3/keccak/KeccakSponge.h
new file mode 100644
--- /dev/null
+++ b/Modules/_sha3/keccak/KeccakSponge.h
@@ -0,0 +1,76 @@
+/*
+The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+Michaël Peeters and Gilles Van Assche. For more information, feedback or
+questions, please refer to our website: http://keccak.noekeon.org/
+
+Implementation by the designers,
+hereby denoted as "the implementer".
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#ifndef _KeccakSponge_h_
+#define _KeccakSponge_h_
+
+#define KeccakPermutationSize 1600
+#define KeccakPermutationSizeInBytes (KeccakPermutationSize/8)
+#define KeccakMaximumRate 1536
+#define KeccakMaximumRateInBytes (KeccakMaximumRate/8)
+
+#if defined(__GNUC__)
+#define ALIGN __attribute__ ((aligned(32)))
+#elif defined(_MSC_VER)
+#define ALIGN __declspec(align(32))
+#else
+#define ALIGN
+#endif
+
+/* Complete state of one sponge instance.  All lengths are counted in bits. */
+ALIGN typedef struct spongeStateStruct {
+    ALIGN unsigned char state[KeccakPermutationSizeInBytes]; /* permutation state */
+    ALIGN unsigned char dataQueue[KeccakMaximumRateInBytes]; /* queued input while absorbing, extracted output while squeezing */
+    unsigned int rate;      /* rate r; InitSponge() requires r + c == 1600 */
+    unsigned int capacity;  /* capacity c */
+    unsigned int bitsInQueue;   /* number of input bits waiting in dataQueue */
+    unsigned int fixedOutputLength; /* 0 after InitSponge(); Init() stores hashbitlen here */
+    int squeezing;          /* non-zero once the sponge has switched to squeezing */
+    unsigned int bitsAvailableForSqueezing; /* output bits in dataQueue not yet handed out */
+} spongeState;
+
+/**
+  * Function to initialize the state of the Keccak[r, c] sponge function.
+  * The sponge function is set to the absorbing phase.
+  * @param  state       Pointer to the state of the sponge function to be initialized.
+  * @param  rate        The value of the rate r.
+  * @param  capacity    The value of the capacity c.
+  * @pre    One must have r+c=1600 and the rate a multiple of 64 bits in this implementation.
+  * @return Zero if successful, 1 otherwise.
+  */
+static int InitSponge(spongeState *state, unsigned int rate, unsigned int capacity);
+/**
+  * Function to give input data for the sponge function to absorb.
+  * @param  state       Pointer to the state of the sponge function initialized by InitSponge().
+  * @param  data        Pointer to the input data. 
+  *                     When @a databitLen is not a multiple of 8, the last bits of data must be
+  *                     in the least significant bits of the last byte.
+  * @param  databitLen  The number of input bits provided in the input data.
+  * @pre    In the previous call to Absorb(), databitLen was a multiple of 8.
+  * @pre    The sponge function must be in the absorbing phase,
+  *         i.e., Squeeze() must not have been called before.
+  * @return Zero if successful, 1 otherwise.
+  */
+static int Absorb(spongeState *state, const unsigned char *data, unsigned long long databitlen);
+/**
+  * Function to squeeze output data from the sponge function.
+  * If the sponge function was in the absorbing phase, this function 
+  * switches it to the squeezing phase.
+  * @param  state       Pointer to the state of the sponge function initialized by InitSponge().
+  * @param  output      Pointer to the buffer where to store the output data.
+  * @param  outputLength    The number of output bits desired.
+  *                     It must be a multiple of 8.
+  * @return Zero if successful, 1 otherwise.
+  */
+static int Squeeze(spongeState *state, unsigned char *output, unsigned long long outputLength);
+
+#endif
diff --git a/Modules/_sha3/keccak/crypto_hash.h b/Modules/_sha3/keccak/crypto_hash.h
new file mode 100644
diff --git a/Modules/_sha3/sha3module.c b/Modules/_sha3/sha3module.c
new file mode 100644
--- /dev/null
+++ b/Modules/_sha3/sha3module.c
@@ -0,0 +1,569 @@
+/* SHA3 module
+ *
+ * This module provides an interface to the SHA3 algorithm
+ *
+ * See below for information about the original code this module was
+ * based upon. Additional work performed by:
+ *
+ *  Andrew Kuchling (amk at amk.ca)
+ *  Greg Stein (gstein at lyra.org)
+ *  Trevor Perrin (trevp at trevp.net)
+ *  Gregory P. Smith (greg at krypto.org)
+ *
+ *  Copyright (C) 2012   Christian Heimes (christian at python.org)
+ *  Licensed to PSF under a Contributor Agreement.
+ *
+ */
+
+#include "Python.h"
+#include "../hashlib.h"
+
+/* **************************************************************************
+ *                             SHA-3 (Keccak)
+ *
+ * The code is based on KeccakReferenceAndOptimized-3.2.zip from 29 May 2012.
+ *
+ * The reference implementation is altered in these points:
+ *  - C++ comments are converted to ANSI C comments.
+ *  - All functions and globals are declared static.
+ *  - The typedef for UINT64 is commented out.
+ *  - brg_endian.h is removed.
+ *  - KeccakF-1600-opt[32|64]-settings.h are commented out
+ *  - Some unused functions are commented out to silence compiler warnings.
+ *
+ * In order to avoid name clashes with other software I have to declare all
+ * Keccak functions and global data as static. The C code is directly
+ * included into this file in order to access the static functions.
+ *
+ * Keccak can be tuned with several parameters. I try to explain all options
+ * as far as I understand them. The reference implementation also contains
+ * assembler code for ARM platforms (NEON instructions).
+ *
+ * Common
+ * ======
+ *
+ * Options:
+ *   UseBebigokimisa, Unrolling
+ *
+ * - Unrolling: loop unrolling (24, 12, 8, 6, 4, 3, 2, 1)
+ * - UseBebigokimisa: lane complementing
+ *
+ * 64bit platforms
+ * ===============
+ *
+ * Additional options:
+ *   UseSSE, UseOnlySIMD64, UseMMX, UseXOP, UseSHLD
+ *
+ * Optimized instructions (disabled by default):
+ *   - UseSSE: use Stream SIMD extensions
+ *     o UseOnlySIMD64: limit to 64bit instructions, otherwise 128bit
+ *     o w/o UseOnlySIMD64: requires compiler argument -mssse3 or -mtune
+ *   - UseMMX: use 64bit MMX instructions
+ *   - UseXOP: use AMD's eXtended Operations (128bit SSE extension)
+ *
+ * Other:
+ *   - Unrolling: default 24
+ *   - UseBebigokimisa: default 1
+ *
+ * When neither UseSSE, UseMMX nor UseXOP is configured, ROL64 (rotate left
+ * 64) is implemented as:
+ *   - Windows: _rotl64()
+ *   - UseSHLD: use shld (shift left) asm optimization
+ *   - otherwise: shift and xor
+ *
+ * UseBebigokimisa can't be used in combination with UseSSE, UseMMX or
+ * UseXOP. UseOnlySIMD64 has no effect unless UseSSE is specified.
+ *
+ * Tests have shown that UseSSE + UseOnlySIMD64 is about three to four
+ * times SLOWER than UseBebigokimisa. UseSSE and UseMMX are about two times
+ * slower. (tested by CH and AP)
+ *
+ * 32bit platforms
+ * ===============
+ *
+ * Additional options:
+ *   UseInterleaveTables, UseSchedule
+ *
+ *   - Unrolling: default 2
+ *   - UseBebigokimisa: default n/a
+ *   - UseSchedule: ???, (1, 2, 3; default 3)
+ *   - UseInterleaveTables: use two 64k lookup tables for (de)interleaving
+ *     default: n/a
+ *
+ * schedules:
+ *   - 3: no UseBebigokimisa, Unrolling must be 2
+ *   - 2 + 1: ???
+ *
+ * *************************************************************************/
+
+/* Select the Keccak permutation implementation (KeccakImplementation 64 or
+ * 32) and its tuning macros from the pointer size and the availability of
+ * PY_UINT64_T. */
+#if SIZEOF_VOID_P == 8 && defined(PY_UINT64_T)
+ /* 64bit platforms with unsigned int64 */
+  #define KeccakImplementation 64
+  #define Unrolling 24
+  #define UseBebigokimisa
+  typedef PY_UINT64_T UINT64;
+#elif SIZEOF_VOID_P == 4  && defined(PY_UINT64_T)
+  /* 32bit platforms with unsigned int64 */
+  #define KeccakImplementation 32
+  #define Unrolling 2
+  #define UseSchedule 3
+  typedef PY_UINT64_T UINT64;
+#else
+  /* 32 or 64bit platforms without unsigned int64 */
+  /* No UINT64 typedef is provided in this branch. */
+  #warning no uint64_t available, force Keccak opt32 with interleave tables
+  #define KeccakImplementation 32
+  #define Unrolling 2
+  #define UseSchedule 3
+  #define UseInterleaveTables
+#endif
+
+/* replacement for brg_endian.h */
+#define IS_BIG_ENDIAN BIG_ENDIAN
+#define IS_LITTLE_ENDIAN LITTLE_ENDIAN
+#define PLATFORM_BYTE_ORDER BYTE_ORDER
+
+/* inline all Keccak dependencies */
+#include "keccak/KeccakNISTInterface.h"
+#include "keccak/KeccakNISTInterface.c"
+#include "keccak/KeccakSponge.c"
+#if KeccakImplementation == 64
+  #include "keccak/KeccakF-1600-opt64.c"
+#elif KeccakImplementation == 32
+  #include "keccak/KeccakF-1600-opt32.c"
+#endif
+
+/* Module-local names for the Keccak NIST interface included above. */
+#define SHA3_BLOCKSIZE 200 /* 1600 bits  */
+#define SHA3_MAX_DIGESTSIZE 64 /* 512 bits */
+#define SHA3_state hashState
+#define SHA3_init Init
+#define SHA3_process Update
+#define SHA3_done Final
+/* Both helpers take the state objects themselves, not pointers to them. */
+#define SHA3_copystate(dest, src) memcpy(&(dest), &(src), sizeof(SHA3_state))
+#define SHA3_clearstate(state) memset(&(state), 0, sizeof(SHA3_state))
+
+/* The structure for storing SHA3 info */
+
+/* Python object wrapping one Keccak hash state. */
+typedef struct {
+    PyObject_HEAD
+    int hashbitlen;         /* digest length in bits, validated by newSHA3object() */
+    SHA3_state hash_state;  /* running hash state */
+#ifdef WITH_THREAD
+    PyThread_type_lock lock;    /* set to NULL by newSHA3object() */
+#endif
+
+} SHA3object;
+
+static PyTypeObject SHA3type;
+
+
+/* Allocate a SHA3object for the given digest length.  Only 224, 256, 384
+ * and 512 are accepted; 0 (arbitrarily-long output) is not supported by
+ * this module.  The hash state itself is left for the caller to set up.
+ * Returns NULL with an exception set on failure. */
+static SHA3object *
+newSHA3object(int hashbitlen)
+{
+    SHA3object *obj;
+    const int supported = (hashbitlen == 224 || hashbitlen == 256 ||
+                           hashbitlen == 384 || hashbitlen == 512);
+
+    if (!supported) {
+        PyErr_SetString(PyExc_ValueError,
+                "hashbitlen must be one of 224, 256, 384 or 512.");
+        return NULL;
+    }
+
+    obj = (SHA3object *)PyObject_New(SHA3object, &SHA3type);
+    if (obj != NULL) {
+        obj->hashbitlen = hashbitlen;
+#ifdef WITH_THREAD
+        obj->lock = NULL;
+#endif
+    }
+    return obj;
+}
+
+
+/* Internal methods for a hash object */
+
+/* Destructor: release the lock if one was allocated, wipe the hash state
+ * and free the object. */
+static void
+SHA3_dealloc(SHA3object *self)
+{
+#ifdef WITH_THREAD
+    if (self->lock != NULL)
+        PyThread_free_lock(self->lock);
+#endif
+    SHA3_clearstate(self->hash_state);
+    PyObject_Del(self);
+}
+
+
+/* External methods for a hash object */
+
+PyDoc_STRVAR(SHA3_copy__doc__, "Return a copy of the hash object.");
+
+/* copy() -- create a new hash object with the same digest length and a
+ * duplicate of the current hash state. */
+static PyObject *
+SHA3_copy(SHA3object *self, PyObject *unused)
+{
+    SHA3object *clone = newSHA3object(self->hashbitlen);
+
+    if (clone == NULL)
+        return NULL;
+
+    /* duplicate the state between ENTER_HASHLIB and LEAVE_HASHLIB */
+    ENTER_HASHLIB(self);
+    SHA3_copystate(clone->hash_state, self->hash_state);
+    LEAVE_HASHLIB(self);
+
+    return (PyObject *)clone;
+}
+
+
+PyDoc_STRVAR(SHA3_digest__doc__,
+"Return the digest value as a string of binary data.");
+
+/* digest() -- finalize a private copy of the hash state and return the
+ * first hashbitlen/8 bytes as a bytes object.  The object's own state is
+ * left untouched, so more data can be added afterwards. */
+static PyObject *
+SHA3_digest(SHA3object *self, PyObject *unused)
+{
+    SHA3_state snapshot;
+    unsigned char out[SHA3_MAX_DIGESTSIZE];
+    HashReturn status;
+
+    ENTER_HASHLIB(self);
+    SHA3_copystate(snapshot, self->hash_state);
+    LEAVE_HASHLIB(self);
+
+    status = SHA3_done(&snapshot, out);
+    /* wipe the temporary state whether or not finalization worked */
+    SHA3_clearstate(snapshot);
+
+    if (status == SUCCESS) {
+        return PyBytes_FromStringAndSize((const char *)out,
+                                          self->hashbitlen / 8);
+    }
+    PyErr_SetString(PyExc_RuntimeError, "internal error in SHA3 Final()");
+    return NULL;
+}
+
+
+PyDoc_STRVAR(SHA3_hexdigest__doc__,
+"Return the digest value as a string of hexadecimal digits.");
+
+/* hexdigest() -- like digest(), but return the result as a str of
+ * hexadecimal digits (two per digest byte). */
+static PyObject *
+SHA3_hexdigest(SHA3object *self, PyObject *unused)
+{
+    unsigned char raw[SHA3_MAX_DIGESTSIZE];
+    SHA3_state snapshot;
+    HashReturn status;
+    PyObject *result;
+    Py_UCS1 *out;
+    int nbytes, pos;
+
+    /* Finalize a private copy so the object can keep absorbing data */
+    ENTER_HASHLIB(self);
+    SHA3_copystate(snapshot, self->hash_state);
+    LEAVE_HASHLIB(self);
+    status = SHA3_done(&snapshot, raw);
+    SHA3_clearstate(snapshot);
+    if (status != SUCCESS) {
+        PyErr_SetString(PyExc_RuntimeError, "internal error in SHA3 Final()");
+        return NULL;
+    }
+
+    /* Allocate the result string: two characters per digest byte */
+    nbytes = self->hashbitlen / 8;
+    result = PyUnicode_New(nbytes * 2, 127);
+    if (result == NULL)
+        return NULL;
+    out = PyUnicode_1BYTE_DATA(result);
+
+    /* High nibble first, then low nibble */
+    for (pos = 0; pos < nbytes; pos++) {
+        const unsigned char byte = raw[pos];
+        out[2 * pos]     = Py_hexdigits[(byte >> 4) & 0xf];
+        out[2 * pos + 1] = Py_hexdigits[byte & 0xf];
+    }
+    assert(_PyUnicode_CheckConsistency(result, 1));
+    return result;
+}
+
+PyDoc_STRVAR(SHA3_update__doc__,
+"Update this hash object's state with the provided string.");
+
+/* update(obj) -- feed the bytes exposed by obj's buffer into the hash.
+ *
+ * Returns None on success.  Returns NULL with an exception set when the
+ * arguments cannot be parsed, when obj provides no usable buffer, or when
+ * the Keccak Update() call reports an error (RuntimeError).
+ *
+ * In threaded builds a per-object lock is created the first time a buffer
+ * of at least HASHLIB_GIL_MINSIZE bytes is seen; from then on every update
+ * runs with the GIL released and the lock held.
+ */
+static PyObject *
+SHA3_update(SHA3object *self, PyObject *args)
+{
+    PyObject *obj;
+    Py_buffer buf;
+    HashReturn res;
+    DataLength bitlen;
+
+    if (!PyArg_ParseTuple(args, "O:update", &obj))
+        return NULL;
+
+    GET_BUFFER_VIEW_OR_ERROUT(obj, &buf);
+
+    /* add new data, the function takes the length in bits not bytes.
+     * Widen before multiplying so the product cannot overflow Py_ssize_t. */
+    bitlen = (DataLength)buf.len * 8;
+
+    /* The guard must be spelled WITH_THREAD, like the one around the lock
+     * member of SHA3object; otherwise this branch is never compiled. */
+#ifdef WITH_THREAD
+    if (self->lock == NULL && buf.len >= HASHLIB_GIL_MINSIZE) {
+        self->lock = PyThread_allocate_lock();
+    }
+    /* Once a lock exists all code paths must be synchronized. We have to
+     * release the GIL even for small buffers as acquiring the lock may take
+     * an unlimited amount of time when another thread updates this object
+     * with lots of data. */
+    if (self->lock) {
+        Py_BEGIN_ALLOW_THREADS
+        PyThread_acquire_lock(self->lock, 1);
+        res = SHA3_process(&self->hash_state, buf.buf, bitlen);
+        PyThread_release_lock(self->lock);
+        Py_END_ALLOW_THREADS
+    }
+    else {
+        res = SHA3_process(&self->hash_state, buf.buf, bitlen);
+    }
+#else
+    res = SHA3_process(&self->hash_state, buf.buf, bitlen);
+#endif
+    /* The lock is released above, inside the block that acquired it; there
+     * is no ENTER_HASHLIB in this function, so no LEAVE_HASHLIB either. */
+
+    PyBuffer_Release(&buf);
+
+    if (res != SUCCESS) {
+        PyErr_SetString(PyExc_RuntimeError,
+                        "internal error in SHA3 Update()");
+        return NULL;
+    }
+
+    Py_RETURN_NONE;
+}
+
+/* Method table of the SHA3 type: copy, digest and hexdigest take no
+ * arguments, update takes positional arguments. */
+static PyMethodDef SHA3_methods[] = {
+    {"copy",      (PyCFunction)SHA3_copy,      METH_NOARGS,
+         SHA3_copy__doc__},
+    {"digest",    (PyCFunction)SHA3_digest,    METH_NOARGS,
+         SHA3_digest__doc__},
+    {"hexdigest", (PyCFunction)SHA3_hexdigest, METH_NOARGS,
+         SHA3_hexdigest__doc__},
+    {"update",    (PyCFunction)SHA3_update,    METH_VARARGS,
+         SHA3_update__doc__},
+    {NULL,        NULL}         /* sentinel */
+};
+
+/* Getter for block_size: always SHA3_BLOCKSIZE, whatever the digest length. */
+static PyObject *
+SHA3_get_block_size(SHA3object *self, void *closure)
+{
+    const long block_size = SHA3_BLOCKSIZE;
+
+    return PyLong_FromLong(block_size);
+}
+
+/* Getter for name: "sha3_" followed by the digest length in bits. */
+static PyObject *
+SHA3_get_name(SHA3object *self, void *closure)
+{
+    const int bits = self->hashbitlen;
+
+    return PyUnicode_FromFormat("sha3_%i", bits);
+}
+
+/* Getter for digest_size: the digest length in bytes. */
+static PyObject *
+SHA3_get_digest_size(SHA3object *self, void *closure)
+{
+    const long nbytes = self->hashbitlen / 8;
+
+    return PyLong_FromLong(nbytes);
+}
+
+
+/* Attributes of the SHA3 type; each entry has a getter and no setter. */
+static PyGetSetDef SHA3_getseters[] = {
+    {"block_size", (getter)SHA3_get_block_size, NULL, NULL, NULL},
+    {"name", (getter)SHA3_get_name, NULL, NULL, NULL},
+    {"digest_size", (getter)SHA3_get_digest_size, NULL, NULL, NULL},
+    {NULL}  /* Sentinel */
+};
+
+/* Type object for _sha3.SHA3: only the destructor, the method table and
+ * the getset table are filled in; all other slots are left at 0/NULL. */
+static PyTypeObject SHA3type = {
+    PyVarObject_HEAD_INIT(NULL, 0)
+    "_sha3.SHA3",       /* tp_name */
+    sizeof(SHA3object), /* tp_basicsize */
+    0,                  /* tp_itemsize */
+    /*  methods  */
+    (destructor)SHA3_dealloc, /* tp_dealloc */
+    0,                  /* tp_print */
+    0,                  /* tp_getattr */
+    0,                  /* tp_setattr */
+    0,                  /* tp_reserved */
+    0,                  /* tp_repr */
+    0,                  /* tp_as_number */
+    0,                  /* tp_as_sequence */
+    0,                  /* tp_as_mapping */
+    0,                  /* tp_hash */
+    0,                  /* tp_call */
+    0,                  /* tp_str */
+    0,                  /* tp_getattro */
+    0,                  /* tp_setattro */
+    0,                  /* tp_as_buffer */
+    Py_TPFLAGS_DEFAULT, /* tp_flags */
+    0,                  /* tp_doc */
+    0,                  /* tp_traverse */
+    0,                  /* tp_clear */
+    0,                  /* tp_richcompare */
+    0,                  /* tp_weaklistoffset */
+    0,                  /* tp_iter */
+    0,                  /* tp_iternext */
+    SHA3_methods,       /* tp_methods */
+    NULL,               /* tp_members */
+    SHA3_getseters,     /* tp_getset */
+};
+
+
+/* Constructor helper shared by sha3_224(), sha3_256(), sha3_384() and
+ * sha3_512().
+ *
+ * args, kwdict: the caller's positional and keyword arguments; the only
+ *               accepted argument is the optional "string".
+ * fmt:          PyArg_ParseTupleAndKeywords() format string supplied by the
+ *               caller.
+ * hashbitlen:   hash length in bits, handed to newSHA3object() and
+ *               SHA3_init().
+ *
+ * If "string" is given, its buffer is hashed into the new object right away.
+ * Returns a new reference to the SHA3 object, or NULL on error; RuntimeError
+ * is set when SHA3_init() or SHA3_process() does not report SUCCESS.
+ */
+static PyObject *
+SHA3_factory(PyObject *args, PyObject *kwdict, const char *fmt,
+             int hashbitlen)
+{
+    SHA3object *newobj = NULL;
+    static char *kwlist[] = {"string", NULL};
+    PyObject *data_obj = NULL;
+    Py_buffer buf;
+    HashReturn res;
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwdict, fmt, kwlist,
+                                     &data_obj)) {
+        return NULL;
+    }
+
+    /* The macro returns NULL from this function on failure; nothing has
+     * been allocated yet at this point, so no cleanup is needed. */
+    if (data_obj)
+        GET_BUFFER_VIEW_OR_ERROUT(data_obj, &buf);
+
+    if ((newobj = newSHA3object(hashbitlen)) == NULL) {
+        goto error;
+    }
+
+    if (SHA3_init(&newobj->hash_state, hashbitlen) != SUCCESS) {
+        PyErr_SetString(PyExc_RuntimeError,
+                        "internal error in SHA3 initialize()");
+        goto error;
+    }
+
+    if (data_obj) {
+        /* NOTE(review): buf.len * 8 is evaluated in Py_ssize_t; confirm it
+         * cannot overflow for very large buffers on 32-bit builds. */
+#ifdef WITH_THREAD
+        if (buf.len >= HASHLIB_GIL_MINSIZE) {
+            /* invariant: New objects can't be accessed by other code yet,
+             * thus it's safe to release the GIL without locking the object.
+             */
+            Py_BEGIN_ALLOW_THREADS
+            res = SHA3_process(&newobj->hash_state, buf.buf, buf.len * 8);
+            Py_END_ALLOW_THREADS
+        }
+        else {
+            res = SHA3_process(&newobj->hash_state, buf.buf, buf.len * 8);
+        }
+#else
+        res = SHA3_process(&newobj->hash_state, buf.buf, buf.len * 8);
+#endif
+        if (res != SUCCESS) {
+            PyErr_SetString(PyExc_RuntimeError,
+                            "internal error in SHA3 Update()");
+            goto error;
+        }
+        PyBuffer_Release(&buf);
+    }
+
+    return (PyObject *)newobj;
+
+  error:
+    if (newobj) {
+        SHA3_clearstate(newobj->hash_state);
+        /* self->lock is always NULL */
+        /* We own the only reference; drop it so the object is not leaked. */
+        Py_DECREF(newobj);
+    }
+    if (data_obj) {
+        PyBuffer_Release(&buf);
+    }
+    return NULL;
+
+}
+
+PyDoc_STRVAR(sha3_224__doc__,
+"sha3_224([string]) -> SHA3 object\n\
+\n\
+Return a new SHA3 hash object with a hashbit length of 28 bytes.");
+
+/* Module-level constructor: delegates to SHA3_factory() with a hashbitlen
+ * of 224 and the "|O:sha3_224" argument format.  self is unused.
+ */
+static PyObject *
+sha3_224(PyObject *self, PyObject *args, PyObject *kwdict)
+{
+    return SHA3_factory(args, kwdict, "|O:sha3_224", 224);
+}
+
+
+PyDoc_STRVAR(sha3_256__doc__,
+"sha3_256([string]) -> SHA3 object\n\
+\n\
+Return a new SHA3 hash object with a hashbit length of 32 bytes.");
+
+/* Module-level constructor: delegates to SHA3_factory() with a hashbitlen
+ * of 256 and the "|O:sha3_256" argument format.  self is unused.
+ */
+static PyObject *
+sha3_256(PyObject *self, PyObject *args, PyObject *kwdict)
+{
+    return SHA3_factory(args, kwdict, "|O:sha3_256", 256);
+}
+
+PyDoc_STRVAR(sha3_384__doc__,
+"sha3_384([string]) -> SHA3 object\n\
+\n\
+Return a new SHA3 hash object with a hashbit length of 48 bytes.");
+
+/* Module-level constructor: delegates to SHA3_factory() with a hashbitlen
+ * of 384 and the "|O:sha3_384" argument format.  self is unused.
+ */
+static PyObject *
+sha3_384(PyObject *self, PyObject *args, PyObject *kwdict)
+{
+    return SHA3_factory(args, kwdict, "|O:sha3_384", 384);
+}
+
+PyDoc_STRVAR(sha3_512__doc__,
+"sha3_512([string]) -> SHA3 object\n\
+\n\
+Return a new SHA3 hash object with a hashbit length of 64 bytes.");
+
+/* Module-level constructor: delegates to SHA3_factory() with a hashbitlen
+ * of 512 and the "|O:sha3_512" argument format.  self is unused.
+ */
+static PyObject *
+sha3_512(PyObject *self, PyObject *args, PyObject *kwdict)
+{
+    return SHA3_factory(args, kwdict, "|O:sha3_512", 512);
+}
+
+
+/* List of functions exported by this module.  Each constructor is declared
+ * METH_VARARGS|METH_KEYWORDS, so it receives both a positional tuple and a
+ * keyword dict.
+ */
+static struct PyMethodDef SHA3_functions[] = {
+    {"sha3_224", (PyCFunction)sha3_224, METH_VARARGS|METH_KEYWORDS,
+         sha3_224__doc__},
+    {"sha3_256", (PyCFunction)sha3_256, METH_VARARGS|METH_KEYWORDS,
+         sha3_256__doc__},
+    {"sha3_384", (PyCFunction)sha3_384, METH_VARARGS|METH_KEYWORDS,
+         sha3_384__doc__},
+    {"sha3_512", (PyCFunction)sha3_512, METH_VARARGS|METH_KEYWORDS,
+         sha3_512__doc__},
+    {NULL,      NULL}            /* Sentinel */
+};
+
+
+/* Module definition for "_sha3"; passed to PyModule_Create() in
+ * PyInit__sha3(). */
+static struct PyModuleDef _SHA3module = {
+        PyModuleDef_HEAD_INIT,
+        "_sha3",            /* m_name */
+        NULL,               /* m_doc: no module docstring */
+        -1,                 /* m_size: no per-module state is allocated */
+        SHA3_functions,     /* m_methods */
+        NULL,               /* remaining slots are unused */
+        NULL,
+        NULL,
+        NULL
+};
+
+/* Module initialization function for "_sha3".
+ *
+ * Finalizes SHA3type with PyType_Ready() and returns the module created from
+ * _SHA3module, or NULL if the type could not be readied.  SHA3type itself is
+ * not added to the module namespace here.
+ */
+PyMODINIT_FUNC
+PyInit__sha3(void)
+{
+    /* The static initializer of SHA3type left ob_type NULL. */
+    Py_TYPE(&SHA3type) = &PyType_Type;
+    if (PyType_Ready(&SHA3type) < 0) {
+        return NULL;
+    }
+
+    return PyModule_Create(&_SHA3module);
+}
diff --git a/Modules/hashlib.h b/Modules/hashlib.h
--- a/Modules/hashlib.h
+++ b/Modules/hashlib.h
@@ -26,3 +26,36 @@
             return NULL; \
         } \
     } while(0);
+
+/*
+ * Helper code to synchronize access to the hash object when the GIL is
+ * released around a CPU-consuming hashlib operation. All code paths that
+ * access a mutable part of obj must be enclosed in an ENTER_HASHLIB /
+ * LEAVE_HASHLIB block or explicitly acquire and release the lock inside
+ * a Py_BEGIN / Py_END_ALLOW_THREADS block if they wish to release the GIL
+ * for an operation.
+ */
+
+#ifdef WITH_THREAD
+#include "pythread.h"
+    #define ENTER_HASHLIB(obj) \
+        if ((obj)->lock) { /* objects without a lock are not synchronized */ \
+            if (!PyThread_acquire_lock((obj)->lock, 0)) { /* no-wait try */ \
+                Py_BEGIN_ALLOW_THREADS \
+                PyThread_acquire_lock((obj)->lock, 1); /* blocking wait */ \
+                Py_END_ALLOW_THREADS \
+            } \
+        }
+    #define LEAVE_HASHLIB(obj) \
+        if ((obj)->lock) { \
+            PyThread_release_lock((obj)->lock); \
+        }
+#else  /* without WITH_THREAD both macros expand to nothing */
+    #define ENTER_HASHLIB(obj)
+    #define LEAVE_HASHLIB(obj)
+#endif
+
+/* Minimum data size (bytes) for releasing the GIL while hashing.  TODO(gps):
+ * make this a module or EVPobject attribute users can tune per platform. */
+#define HASHLIB_GIL_MINSIZE 2048
+
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -838,6 +838,15 @@
         exts.append( Extension('_sha1', ['sha1module.c'],
                                depends=['hashlib.h']) )
 
+        # SHA-3 (Keccak) module
+        sha3_depends = ['hashlib.h']
+        keccak = os.path.join(os.getcwd(), srcdir, 'Modules', '_sha3',
+                              'keccak')
+        for pattern in ('*.c', '*.h', '*.macros'):
+            sha3_depends.extend(glob(os.path.join(keccak, pattern)))
+        exts.append(Extension("_sha3", ["_sha3/sha3module.c"],
+                              depends=sha3_depends))
+
         # Modules that provide persistent dictionary-like semantics.  You will
         # probably want to arrange for at least one of them to be available on
         # your machine, though none are defined by default because of library

-- 
Repository URL: http://hg.python.org/cpython


More information about the Python-checkins mailing list