tfc-mirror/src/common/crypto.py

#!/usr/bin/env python3.7
# -*- coding: utf-8 -*-

"""
TFC - Onion-routed, endpoint secure messaging system
Copyright (C) 2013-2020  Markus Ottela

This file is part of TFC.

TFC is free software: you can redistribute it and/or modify it under the terms
of the GNU General Public License as published by the Free Software Foundation,
either version 3 of the License, or (at your option) any later version.

TFC is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with TFC. If not, see <https://www.gnu.org/licenses/>.

---

This module contains TFC's cryptographic functions. Most algorithms are
based on the ChaCha stream cipher by Daniel J. Bernstein (djb).

Curve448-Goldilocks
└─ X448 key exchange
ChaCha stream cipher
├─ BLAKE2b cryptographic hash function
|  └─ Argon2id password hashing function
└─ ChaCha20 stream cipher
   ├─ XChaCha20-Poly1305 AEAD (IETF variant)
   └─ Linux kernel primary DRNG
"""

import hashlib
import os

import argon2
import nacl.bindings
import nacl.exceptions
import nacl.secret
import nacl.utils

from typing import Tuple

from cryptography.hazmat.primitives                 import padding
from cryptography.hazmat.primitives.asymmetric.x448 import X448PrivateKey, X448PublicKey
from cryptography.hazmat.primitives.serialization   import Encoding, PublicFormat

from src.common.exceptions import CriticalError
from src.common.misc       import separate_header
from src.common.statics    import (ARGON2_SALT_LENGTH, BITS_PER_BYTE, BLAKE2_DIGEST_LENGTH, BLAKE2_DIGEST_LENGTH_MAX,
                                   BLAKE2_DIGEST_LENGTH_MIN, FINGERPRINT, FINGERPRINT_LENGTH, MESSAGE_KEY, HEADER_KEY,
                                   PADDING_LENGTH, SYMMETRIC_KEY_LENGTH, TFC_PUBLIC_KEY_LENGTH,
                                   X448_SHARED_SECRET_LENGTH, XCHACHA20_NONCE_LENGTH)


def blake2b(message:     bytes,                        # Data to hash
            key:         bytes = b'',                  # Optional key (keyed hashing)
            salt:        bytes = b'',                  # Optional salt (randomized hashing)
            person:      bytes = b'',                  # Optional personalization string
            digest_size: int   = BLAKE2_DIGEST_LENGTH  # Digest length in bytes
            ) -> bytes:                                # The BLAKE2b digest
    """Compute the BLAKE2b digest (i.e. cryptographic hash) of a message.

    BLAKE2 is the successor of the SHA3-finalist BLAKE*. It was designed
    by Jean-Philippe Aumasson, Samuel Neves, Zooko Wilcox-O'Hearn, and
    Christian Winnerlein, and it is built on the ChaCha stream cipher
    designed by djb.

    * BLAKE itself was designed by Jean-Philippe Aumasson, Luca Henzen,
      Willi Meier, and Raphael C.-W. Phan.

    For more details, see
        https://blake2.net/
        https://tools.ietf.org/html/rfc7693.html
        https://docs.python.org/3.7/library/hashlib.html#blake2

    TFC uses BLAKE2b for the following reasons:

        o BLAKE received more in-depth cryptanalysis[1] than Keccak (SHA3):

          "Keccak received a significant amount of cryptanalysis,
           although not quite the depth of analysis applied to BLAKE,
           Grøstl, or Skein."[2; p.13]

        o BLAKE shares design elements with SHA-2[3], which has 11 years
          of cryptanalysis[4] behind it.

        o 128-bit collision/preimage/second-preimage resistance against
          Grover's algorithm running on a quantum Turing machine.

        o The algorithm ships with Python3.7's hashlib.

        o Compared to SHA3-256, the algorithm is faster on CPUs, which
          translates to better hash ratchet performance:

          "The ARX-based algorithms, BLAKE and Skein, perform extremely
           well in software."[2; p.13]

        o Compared to SHA3-256, the algorithm is slower on ASICs, which
          slows down attacks by high-budget adversaries:

          "Keccak has a clear advantage in throughput/area performance
           in hardware implementations."[2; p.13]

    Note that although the default digest length of BLAKE2b (the
    implementation optimized for AMD64 systems) is 512 bits, TFC
    truncates the digest length to 256 bits.

    The correctness of the BLAKE2b implementation[5] is tested by TFC
    unit tests, using the complete suite of BLAKE2b known answer tests
    (KATs).

    Any ValueError raised by hashlib while creating the digest is
    converted into a CriticalError, as is a digest whose type or length
    does not match what was requested.

     [1] https://blake2.net/#cr
     [2] https://nvlpubs.nist.gov/nistpubs/ir/2012/NIST.IR.7896.pdf
     [3] https://leastauthority.com/blog/BLAKE2-harder-better-faster-stronger-than-MD5/
     [4] https://en.wikipedia.org/wiki/SHA-2#Cryptanalysis_and_validation
     [5] https://github.com/python/cpython/tree/3.7/Modules/_blake2
         https://github.com/python/cpython/blob/3.7/Lib/hashlib.py
    """
    try:
        hasher = hashlib.blake2b(message,
                                 digest_size=digest_size,
                                 key=key,
                                 salt=salt,
                                 person=person)
        digest = hasher.digest()  # type: bytes
    except ValueError as error:
        raise CriticalError(str(error))

    # Defensive sanity checks on the value handed back by hashlib.
    digest_is_bytes = isinstance(digest, bytes)
    if not digest_is_bytes:
        raise CriticalError(f"BLAKE2b returned an invalid type ({type(digest)}) digest.")

    actual_length = len(digest)
    if actual_length != digest_size:
        raise CriticalError(f"BLAKE2b digest had invalid length ({actual_length} bytes).")

    return digest


def argon2_kdf(password:    str,    # Password to stretch into a key
               salt:        bytes,  # Salt (must be ARGON2_SALT_LENGTH bytes long)
               time_cost:   int,    # Number of iterations
               memory_cost: int,    # Amount of memory to use (in kibibytes, per argon2_cffi)
               parallelism: int     # Number of threads to use
               ) -> bytes:          # The derived key (SYMMETRIC_KEY_LENGTH bytes)
    """Stretch a password and salt into an encryption key with Argon2id.

    Argon2 is a password hashing function designed by Alex Biryukov,
    Daniel Dinu, and Dmitry Khovratovich from the University of
    Luxembourg. It won the 2015 Password Hashing Competition (PHC).

    For more details, see
        https://password-hashing.net/
        https://en.wikipedia.org/wiki/Argon2

    TFC uses Argon2 for the following reasons:

        o PBKDF2 and bcrypt are not memory-hard, which makes them weak
          against massively parallel computing attacks with
          FPGAs/GPUs/ASICs.[1; p.2]

        o scrypt is very complex as it "combines two independent
          cryptographic primitives (the SHA256 hash function, and
          the Salsa20/8 core operation), and four generic operations
          (HMAC, PBKDF2, Block-Mix, and ROMix)."[2; p.10]
              Furthermore, scrypt is "vulnerable to trivial time-memory
          trade-off (TMTO) attacks that allows compact implementations
          with the same energy cost."[1; p.2]

        o Of all of the PHC finalists, only Catena and Argon2i offer
          complete cache-timing resistance by using data-independent
          memory access. Catena does not support parallelism[2; p.49],
          so if it later turns out TFC needs stronger protection from
          cache-timing attacks, having selected Argon2 (that always
          supports parallelism) is ideal, because switching from
          Argon2id to Argon2i is trivial.

        o More secure algorithms such as the Balloon hash function[3] do
          not have robust implementations.

    The purpose of Argon2 is to stretch a password into a 256-bit key.
    It features a slow, memory-hard hash function that consumes the
    computational resources of an attacker who attempts a dictionary
    or a brute force attack.

    The function also takes a salt (a 256-bit random value in this case)
    that prevents rainbow-table attacks and forces each attack to take
    place against an individual (physically compromised) TFC-endpoint,
    or PSK transmission media.

    The Argon2 variant used is Argon2id, the current recommendation of
    the draft RFC[4]. Argon2id uses data-independent memory access for
    the first half of the first iteration, and data-dependent memory
    access for the rest. This provides a lot of protection against TMTO
    attacks, which is great because most of the expected attacks are
    against physically compromised data storage devices where the
    encrypted data is at rest.
        Argon2id also adds some security against side-channel attacks
    that malicious code injected to the Destination Computer might
    perform. Considering these two attacks, Argon2id is the most secure
    choice.

    The correctness of the Argon2id implementation[5] is tested by TFC
    unit tests. The tests compare the output of the argon2_cffi library
    with the output of the Argon2 reference command-line utility under
    randomized input parameters.

    A CriticalError is raised if the salt has the wrong length, if
    argon2_cffi reports an Argon2Error, or if the derived key has an
    unexpected type or length.

     [1] https://github.com/P-H-C/phc-winner-argon2/blob/master/argon2-specs.pdf
     [2] https://password-hashing.net/submissions/specs/Catena-v5.pdf
     [3] https://crypto.stanford.edu/balloon/
     [4] https://tools.ietf.org/html/draft-irtf-cfrg-argon2-06#section-9.4
     [5] https://github.com/P-H-C/phc-winner-argon2
         https://github.com/hynek/argon2_cffi
    """
    salt_length = len(salt)
    if salt_length != ARGON2_SALT_LENGTH:
        raise CriticalError(f"Invalid salt length ({salt_length} bytes).")

    try:
        derived_key = argon2.low_level.hash_secret_raw(
            secret=password.encode(),
            salt=salt,
            time_cost=time_cost,
            memory_cost=memory_cost,
            parallelism=parallelism,
            hash_len=SYMMETRIC_KEY_LENGTH,
            type=argon2.Type.ID)  # type: bytes
    except argon2.exceptions.Argon2Error as error:
        raise CriticalError(str(error))

    # Defensive sanity checks on the value handed back by argon2_cffi.
    if not isinstance(derived_key, bytes):
        raise CriticalError(f"Argon2 returned an invalid type ({type(derived_key)}) key.")

    derived_length = len(derived_key)
    if derived_length != SYMMETRIC_KEY_LENGTH:
        raise CriticalError(f"Derived an invalid length key from password ({derived_length} bytes).")

    return derived_key


class X448(object):
    """X448 key exchange on Curve448-Goldilocks.

    X448 is the Diffie-Hellman function for Curve448-Goldilocks, a
    state-of-the-art elliptic curve published by Mike Hamburg in 2014.

    For more details, see
        https://eprint.iacr.org/2015/625.pdf
        http://ed448goldilocks.sourceforge.net/
        https://en.wikipedia.org/wiki/Curve448

    TFC uses X448 for the following reasons:

        o Curve448 meets the criteria for a SafeCurve[1]:

          Parameters

            - Use of large prime field (p = 2^448 - 2^224 - 1).

            - The Edwards curve (x^2+y^2 = 1-39081x^2y^2) is complete.

            - The base point (x_1,y_1) is on the curve.

          ECDLP security

            - 222.8-bit security against the Pollard's rho method.
                  This is important as the security of the hash ratchet
              depends on the security of the root key. Curve25519 is
              thus a less feasible choice. Curve448 is also likely to
              resist quantum computers and mathematical breakthroughs
              against ECC for a longer time.

            - Safe against additive and multiplicative transfer.

            - The complex-multiplication field discriminant is 2^447.5,
              which is much larger than the required minimum (2^100).

            - The curve-generation process is fully rigid, i.e. it has
              been completely explained. In comparison, NIST P-curves
              use coefficients generated by hashing unexplained seeds.

          ECC security

            - Use of Montgomery ladder that protects from side channel
              attacks by doing constant-time single-scalar multiplication.

            - 221.8-bit security against twist attacks (small-subgroup
              attack combined with invalid-curve attack).

            - Support for complete single-scalar and multi-scalar
              multiplication formulas.

            - Points on Curve448 (e.g. public keys) are
              indistinguishable from uniform random strings.

        o Safer curves (M-511 and E-521) do not have robust implementations.

        o NIST has announced X448 will be included in the SP 800-186.[2]

        o Its public keys do not require validation as long as the
          resulting shared secret is not zero:

          "[X448] is actually two curves, where any patterns of bits
           will be interpreted as a point on one of the curves or on the
           other."[3]

        o Its public keys are reasonably short (84 chars when WIF-encoded)
          to be manually typed from Networked Computer to Source Computer.

    The correctness of the X448 implementation[4] is tested by TFC unit
    tests. The testing is done in limited scope by using the official
    test vectors.

     [1] https://safecurves.cr.yp.to/
     [2] https://csrc.nist.gov/News/2017/Transition-Plans-for-Key-Establishment-Schemes
     [3] https://crypto.stackexchange.com/a/44348
     [4] https://github.com/openssl/openssl/tree/OpenSSL_1_1_1-stable/crypto/ec/curve448
         https://github.com/pyca/cryptography/blob/master/src/cryptography/hazmat/primitives/asymmetric/x448.py
    """

    @staticmethod
    def generate_private_key() -> 'X448PrivateKey':
        """Create a fresh X448 private key.

        pyca/cryptography generates the key as follows:

        1. This method calls `X448PrivateKey.generate()`, and that class
           method imports the OpenSSL backend[1].

        2. Importing the backend makes Python call the Backend class[2],
           which runs the `__init__()` method of the class[3], which in
           turn calls the `activate_osrandom_engine()` instance
           method[4].

        3. `activate_osrandom_engine()` disables the default OpenSSL
           CSPRNG, and activates the pyca/cryptography
           "OS random engine".[5]

        4. Unlike the OpenSSL user-space CSPRNG that only seeds from
           /dev/urandom, the OS random engine uses the GETRANDOM(0)
           syscall that sources all of its entropy directly from the
           LRNG's ChaCha20 DRNG. The OS random engine does not suffer
           from the fork() weakness where a forked process is not
           automatically reseeded, and it is also safe from issues with
           OpenSSL CSPRNG initialization.[6]

        5. The fallback option (/dev/urandom) of the OS random engine
           might be problematic on pre-3.17 kernels if the CSPRNG has
           not been properly seeded. However, TFC checks that the kernel
           version of the OS it's running on is at least 4.17. This
           means that the used source of entropy is always
           GETRANDOM(0).[7] This can be verified from the source code as
           well: The last parameter `0` of the GETRANDOM syscall[8]
           indicates the GRND_NONBLOCK flag is not set. This means the
           ChaCha20 DRNG is used, and that it does not yield entropy
           until it has been fully seeded. This is the same case as with
           TFC's `csprng()` function.

         [1] https://github.com/pyca/cryptography/blob/2.7/src/cryptography/hazmat/primitives/asymmetric/x448.py#L38
         [2] https://github.com/pyca/cryptography/blob/2.7/src/cryptography/hazmat/backends/openssl/backend.py#L2445
         [3] https://github.com/pyca/cryptography/blob/2.7/src/cryptography/hazmat/backends/openssl/backend.py#L115
         [4] https://github.com/pyca/cryptography/blob/2.7/src/cryptography/hazmat/backends/openssl/backend.py#L122
         [5] https://cryptography.io/en/latest/hazmat/backends/openssl/#activate_osrandom_engine
         [6] https://cryptography.io/en/latest/hazmat/backends/openssl/#os-random-engine
         [7] https://cryptography.io/en/latest/hazmat/backends/openssl/#os-random-sources
         [8] https://github.com/pyca/cryptography/blob/master/src/_cffi_src/openssl/src/osrandom_engine.c#L391
        """
        private_key = X448PrivateKey.generate()
        return private_key

    @staticmethod
    def derive_public_key(private_key: 'X448PrivateKey') -> bytes:
        """Return the raw-encoded public key of an X448 private key.

        Raises CriticalError if the serialized key is not a bytes object
        of TFC_PUBLIC_KEY_LENGTH bytes.
        """
        public_key_object = private_key.public_key()
        raw_public_key    = public_key_object.public_bytes(encoding=Encoding.Raw,
                                                           format=PublicFormat.Raw)  # type: bytes

        if not isinstance(raw_public_key, bytes):
            raise CriticalError(f"Generated an invalid type ({type(raw_public_key)}) public key.")

        key_length = len(raw_public_key)
        if key_length != TFC_PUBLIC_KEY_LENGTH:
            raise CriticalError(f"Generated an invalid size public key from private key ({key_length} bytes).")

        return raw_public_key

    @staticmethod
    def shared_key(private_key: 'X448PrivateKey', public_key: bytes) -> bytes:
        """Derive the X448 shared key from own private key and contact's public key.

        The pyca/cryptography library validates the length of the public
        key and verifies that the shared secret is not zero. A
        ValueError from either step is converted into a CriticalError.

        The X448 shared secret is not a random byte string, but a random
        point on the curve. The raw bits of the shared secret might
        therefore not be uniformly distributed in the keyspace, but
        biased towards 0 or 1.
            To remove the bias, the raw shared secret is passed through
        a computational extractor (BLAKE2b CSPRF), which ensures the
        shared key is uniformly random.

        While `shared secret` and `shared key` are used synonymously, in
        TFC we choose to distinguish between the raw shared secret and
        the BLAKE2b compressed shared secret by calling only the latter
        the `shared key`.

        Note that the shared key won't be used directly as a session
        key. Instead, it will be used as the key parameter in separate
        BLAKE2b instances where the hash function is used as a KDF to
        extract unidirectional message/header keys and fingerprints.
        """
        try:
            contact_key = X448PublicKey.from_public_bytes(public_key)
            raw_secret  = private_key.exchange(contact_key)  # type: bytes
        except ValueError as error:
            raise CriticalError(str(error))

        if not isinstance(raw_secret, bytes):  # pragma: no cover
            raise CriticalError(f"Derived an invalid type ({type(raw_secret)}) shared secret.")

        secret_length = len(raw_secret)
        if secret_length != X448_SHARED_SECRET_LENGTH:  # pragma: no cover
            raise CriticalError(f"Generated an invalid size shared secret ({secret_length} bytes).")

        # Compress the (possibly biased) curve point into a uniform key.
        return blake2b(raw_secret, digest_size=SYMMETRIC_KEY_LENGTH)

    @staticmethod
    def derive_keys(dh_shared_key:          bytes,
                    tfc_public_key_user:    bytes,
                    tfc_public_key_contact: bytes
                    ) -> Tuple[bytes, bytes, bytes, bytes, bytes, bytes]:
        """Derive domain separated message keys, header keys, and fingerprints.

        Returns the tuple (tx_mk, rx_mk, tx_hk, rx_hk, tx_fp, rx_fp).

        The unidirectional keys are domain separated from the shared key
        by using a public key as the message and the context variable as
        the personalization string. The tx keys are derived from the
        contact's public key and the rx keys from the user's public key.

        The fingerprints of public keys are domain separated by using
        the shared key as the BLAKE2b key and the context variable as
        the personalization string. The tx fingerprint is derived from
        the user's public key and the rx fingerprint from the contact's.
        This way entities who might monitor the fingerprint verification
        channel are unable to correlate spoken values with public keys
        that they might see on RAM or screen of Networked Computer:
        Public keys can not be derived from the fingerprints due to
        preimage resistance of BLAKE2b, and fingerprints can not be
        derived from a public key without the X448 shared key. Using the
        context variable ensures fingerprints are distinct from derived
        message and header keys.

        Raises CriticalError if any two of the derived values collide.
        """
        def derive(public_key: bytes, context: bytes, length: int) -> bytes:
            """Hash a public key under the shared key with the given context."""
            return blake2b(public_key, dh_shared_key, person=context, digest_size=length)

        derived_keys = (
            derive(tfc_public_key_contact, MESSAGE_KEY, SYMMETRIC_KEY_LENGTH),  # tx_mk
            derive(tfc_public_key_user,    MESSAGE_KEY, SYMMETRIC_KEY_LENGTH),  # rx_mk
            derive(tfc_public_key_contact, HEADER_KEY,  SYMMETRIC_KEY_LENGTH),  # tx_hk
            derive(tfc_public_key_user,    HEADER_KEY,  SYMMETRIC_KEY_LENGTH),  # rx_hk
            derive(tfc_public_key_user,    FINGERPRINT, FINGERPRINT_LENGTH),    # tx_fp
            derive(tfc_public_key_contact, FINGERPRINT, FINGERPRINT_LENGTH),    # rx_fp
        )

        # A set drops duplicates, so a size mismatch means a collision.
        unique_count = len(set(derived_keys))
        if unique_count != len(derived_keys):
            raise CriticalError("Derived keys were not unique.")

        return derived_keys


def encrypt_and_sign(plaintext: bytes,       # Data to encrypt
                     key:       bytes,       # 32-byte symmetric key
                     ad:        bytes = b''  # Associated data (authenticated, not encrypted)
                     ) -> bytes:             # Nonce + ciphertext + tag
    """Encrypt and authenticate plaintext with XChaCha20-Poly1305 (IETF variant).

    A fresh random nonce is drawn from `csprng()` for every call and
    prepended to the ciphertext and tag in the returned bytestring.
    A CriticalError is raised if the key length is invalid or if the
    encryption call reports a CryptoError.

    ChaCha20 is a stream cipher published by Daniel J. Bernstein (djb)
    in 2008. It is an improved version of Salsa20 -- another stream
    cipher by djb -- that ECRYPT selected into the eSTREAM portfolio in
    2008. The improvement is that ChaCha20 increases the per-round
    diffusion compared to Salsa20 while maintaining or increasing speed.

    For more details, see
        https://cr.yp.to/chacha.html
        https://cr.yp.to/snuffle.html
        https://cr.yp.to/snuffle/security.pdf
        https://en.wikipedia.org/wiki/Salsa20#ChaCha_variant

    Poly1305 is a Wegman-Carter message authentication code (MAC), also
    designed by djb. The MAC is provably secure if ChaCha20 is secure.
    The 128-bit tag space ensures the attacker's advantage to create an
    existential forgery is negligible.

    For more details, see
        https://cr.yp.to/mac.html

    The version used in TFC is XChaCha20-Poly1305-IETF[1], a variant of
    ChaCha20-Poly1305-IETF (RFC 8439[2]). Quoting libsodium, the
    XChaCha20 (=eXtended-nonce ChaCha20) variant allows encryption of
    ~2^64 bytes per message, encryption of a practically unlimited
    number of messages, and safe use of random nonces due to the 192-bit
    nonce space[3].

    TFC uses XChaCha20-Poly1305 for the following reasons:

        o Conservative 256-bit key size[4] that matches the 222.8-bit
          security of X448, and BLAKE2b (with truncated, 256-bit hashes).

        o The Salsa20 algorithm has 14 years of cryptanalysis behind it[5]
          and ChaCha20 has resisted cryptanalysis as well[6][7]. Currently
          the best public attack[8] breaks ChaCha7 in 2^233 operations.

        o Security against differential and linear cryptanalysis.[9][10]

        o Security against cache-timing attacks on all CPUs (unlike AES
          on CPUs without AES-NI).[11; p.2]

        o The increased diffusion over the well-received Salsa20.[12]

        o The algorithm is much faster compared to AES (in cases where
          the CPU and/or implementation does not support AES-NI).[12]

        o The good name of djb.[13]

    The correctness of the XChaCha20-Poly1305 implementation[14] is
    tested by TFC unit tests. The testing is done in limited scope by
    using the libsodium and official IETF test vectors.

      [1] https://tools.ietf.org/html/draft-irtf-cfrg-xchacha-01
      [2] https://tools.ietf.org/html/rfc8439
      [3] https://download.libsodium.org/doc/secret-key_cryptography/aead/chacha20-poly1305/xchacha20-poly1305_construction
      [4] https://cr.yp.to/snuffle/keysizes.pdf
      [5] https://en.wikipedia.org/wiki/Salsa20#Cryptanalysis_of_Salsa20
      [6] https://eprint.iacr.org/2007/472.pdf
      [7] https://eprint.iacr.org/2015/698.pdf
      [8] https://eprint.iacr.org/2016/377.pdf
      [9] https://www.cryptrec.go.jp/exreport/cryptrec-ex-2601-2016.pdf
     [10] https://eprint.iacr.org/2013/328.pdf
     [11] https://cr.yp.to/antiforgery/cachetiming-20050414.pdf
     [12] https://cr.yp.to/chacha/chacha-20080128.pdf
     [13] https://www.eff.org/sv/deeplinks/2015/04/remembering-case-established-code-speech
     [14] https://github.com/jedisct1/libsodium/blob/master/src/libsodium/crypto_core/hchacha20/core_hchacha20.c
          https://github.com/jedisct1/libsodium/blob/master/src/libsodium/crypto_aead/xchacha20poly1305/sodium/aead_xchacha20poly1305.c
          https://github.com/pyca/pynacl/blob/master/src/nacl/bindings/crypto_aead.py#L349
    """
    key_length = len(key)
    if key_length != SYMMETRIC_KEY_LENGTH:
        raise CriticalError(f"Invalid key length ({key_length} bytes).")

    # A new random nonce for every encryption; the 192-bit space makes this safe.
    nonce = csprng(XCHACHA20_NONCE_LENGTH)

    aead_encrypt = nacl.bindings.crypto_aead_xchacha20poly1305_ietf_encrypt
    try:
        ciphertext_and_tag = aead_encrypt(plaintext, ad, nonce, key)  # type: bytes
    except nacl.exceptions.CryptoError as error:
        raise CriticalError(str(error))

    return nonce + ciphertext_and_tag


def auth_and_decrypt(nonce_ct_tag: bytes,       # Nonce + ciphertext + tag
                     key:          bytes,       # 32-byte symmetric key
                     database:     str   = '',  # When provided, gracefully exits TFC when the tag is invalid
                     ad:           bytes = b''  # Associated data
                     ) -> bytes:                # Plaintext
    """Verify the tag of an XChaCha20-Poly1305 ciphertext and decrypt it.

    The Poly1305 tag is checked using constant time `sodium_memcmp`:
        https://download.libsodium.org/doc/helpers#constant-time-test-for-equality

    Ciphertext from an untrusted source (i.e., a contact) is decrypted
    without the `database` parameter. If the tag of such a ciphertext is
    invalid, the `nacl.exceptions.CryptoError` is re-raised so that TFC
    can discard the ciphertext and recover appropriately.

    Ciphertext from a trusted source (i.e., a database) is decrypted
    with the `database` parameter set, so the function knows which
    database is in question. If authentication fails due to an invalid
    tag, the data is assumed to be either tampered with, or corrupted.
    In that case a CriticalError is raised: TFC gracefully exits to
    avoid processing the unsafe data and warns the user in which
    database the issue was detected.

    A CriticalError is also raised when the key length is invalid.
    """
    key_length = len(key)
    if key_length != SYMMETRIC_KEY_LENGTH:
        raise CriticalError(f"Invalid key length ({key_length} bytes).")

    # The nonce is the fixed-length prefix of the input.
    nonce, ciphertext_and_tag = separate_header(nonce_ct_tag, XCHACHA20_NONCE_LENGTH)

    aead_decrypt = nacl.bindings.crypto_aead_xchacha20poly1305_ietf_decrypt
    try:
        decrypted = aead_decrypt(ciphertext_and_tag, ad, nonce, key)  # type: bytes
    except nacl.exceptions.CryptoError:
        if not database:
            # Untrusted source: let the caller handle the failure.
            raise
        raise CriticalError(f"Authentication of data in database '{database}' failed.")

    return decrypted


def byte_padding(bytestring: bytes  # Data to pad
                 ) -> bytes:        # Padded data
    """Pad bytestring to the next 255 bytes.

    TFC pads the messages it outputs so that each assembly packet has a
    constant length. When traffic masking is disabled, the padding means
    the packet length reveals only the maximum length of the compressed
    message.

    When traffic masking is enabled, the padding contributes to traffic
    flow confidentiality: During traffic masking, TFC will output a
    constant stream of padded packets at constant intervals that hides
    metadata about message length (i.e., the adversary won't be able to
    distinguish when transmission of packet or series of packets starts
    and stops), as well as the type (message/file) of transferred data.

    TFC uses the PKCS #7 padding scheme described in RFC 2315 and RFC 5652:
        https://tools.ietf.org/html/rfc2315#section-10.3
        https://tools.ietf.org/html/rfc5652#section-6.3

    For a better explanation, see
        https://en.wikipedia.org/wiki/Padding_(cryptography)#PKCS#5_and_PKCS#7

    Raises CriticalError if the padded result is not a bytes object
    whose length is a multiple of PADDING_LENGTH.
    """
    block_size_bits = PADDING_LENGTH * BITS_PER_BYTE  # PKCS7() takes the block size in bits
    pkcs7_padder    = padding.PKCS7(block_size_bits).padder()
    padded_bytes    = pkcs7_padder.update(bytestring) + pkcs7_padder.finalize()  # type: bytes

    if not isinstance(padded_bytes, bytes):
        raise CriticalError(f"Padded message had invalid type ({type(padded_bytes)}).")

    padded_length = len(padded_bytes)
    if padded_length % PADDING_LENGTH != 0:
        raise CriticalError(f"Padded message had an invalid length ({padded_length}).")

    return padded_bytes


def rm_padding_bytes(bytestring: bytes  # Padded data
                     ) -> bytes:        # Data without the padding
    """Strip the PKCS #7 padding from plaintext.

    The length of the padding is determined by the ord-value of the
    last byte, which is always part of the padding.
    """
    block_size_bits = PADDING_LENGTH * BITS_PER_BYTE  # PKCS7() takes the block size in bits
    pkcs7_unpadder  = padding.PKCS7(block_size_bits).unpadder()
    stripped_bytes  = pkcs7_unpadder.update(bytestring) + pkcs7_unpadder.finalize()  # type: bytes

    return stripped_bytes


def csprng(key_length: int = SYMMETRIC_KEY_LENGTH  # Length of the key
           ) -> bytes:                             # The generated key
    """Generate a cryptographically secure random key.

    The default key length is 32 bytes (256 bits).

    The key is generated by the Linux kernel's modern, ChaCha20-based
    cryptographically secure pseudo-random number generator (CSPRNG),
    also known as the Linux-RNG, or LRNG.

    For more details, see
        https://www.2uo.de/myths-about-urandom/
        https://www.chronox.de/lrng/doc/lrng.pdf
        https://www.bsi.bund.de/SharedDocs/Downloads/EN/BSI/Publications/Studies/LinuxRNG/LinuxRNG_EN.pdf
        https://github.com/torvalds/linux/blob/master/drivers/char/random.c


    TFC key generation overview
    ===========================

    The following schematic of the LRNG and its relation to TFC is based
    on [1; p.19]. (Note: the page number for the BSI report is always
    the PDF page number, not the printed page number in the bottom
    margin of each page. This makes searching of the citations faster.)

                X448 private keys          Other TFC keys
                        ↑                         ↑
                 OS random engine          BLAKE2b (by TFC)
                        ↑                         ↑
                        └────────┐       ┌────────┘
                                GETRANDOM()
                                     ↑
                                     |  ┌────────┐
                                     |  |        | State transition
                             ┏━━━━━━━━━━━━━━━┓   |
                             ┃ ChaCha20 DRNG ┃<──┘
                             ┗━━━━━━━━━━━━━━━┛
                                     ↑
                                   fold
                                     ↑
                                   SHA-1 ────────┐
                                     ↑           | State transition
                             ┏━━━━━━━━━━━━━━┓    |
                             ┃  input_pool  ┃<───┘
                             ┗━━━━━━━━━━━━━━┛<─────────────────────┐
                               ↑ ↑       ↑                         |
          ┌────────────────────┘ |    ┏━━━━━━━━━━━━━━━┓            |
          |              ┌───────┘    ┃ Time variance ┃      ┏━━━━━━━━━━━┓
          |              |            ┃  calculation  ┃      ┃ fast_pool ┃
          |              |            ┗━━━━━━━━━━━━━━━┛      ┗━━━━━━━━━━━┛
          |              |                ↑       ↑                ↑
    ┏━━━━━━━━━━━┓┏━━━━━━━━━━━━━━━┓┏━━━━━━━━━━━┓┏━━━━━━━━━━━┓┏━━━━━━━━━━━━━┓
    ┃add_device ┃┃add_hwgenerator┃┃ add_input ┃┃ add_disk  ┃┃add_interrupt┃
    ┃_randomness┃┃  _randomness  ┃┃_randomness┃┃_randomness┃┃ _randomness ┃
    ┗━━━━━━━━━━━┛┗━━━━━━━━━━━━━━━┛┗━━━━━━━━━━━┛┗━━━━━━━━━━━┛┗━━━━━━━━━━━━━┛

     [1] https://www.bsi.bund.de/SharedDocs/Downloads/EN/BSI/Publications/Studies/LinuxRNG/LinuxRNG_EN.pdf


    Entropy sources
    ===============

    The APIs for the raw entropy sources of the LRNG include

        o add_device_randomness: Device driver related data believed to
          provide entropy. The device driver specific value is mixed
          into the unseeded ChaCha20 DRNG and input_pool during boot
          along with the high-resolution time stamp, XORed with the
          Jiffies (Linux kernel timer). The value is requested only
          once, and it is not considered to contain any entropy.[1; p.52]

        o add_hwgenerator_randomness: HWRNGs supported by the Linux
          kernel, if available. The output of the HWRNG device is used
          to seed the ChaCha20 DRNG if needed, and then to seed the
          input_pool directly when the entropy estimator's value falls
          below the set threshold. (CPU HWRNG is not processed by the
          add_hwgenerator_randomness service function).[1; pp.52-54]

        o add_input_randomness: Key presses, mouse movements, mouse
          button presses etc. Repeated event values (e.g. key presses or
          same direction mouse movements) are ignored by the service
          function.[1; p.44]
              The event data consists of four LSBs of the event type,
          four MSBs of the event code, the event code itself, and the
          event value, all XORed together.[1; p.45]
              The resulting event data is fed into the input_pool via
          add_timer_randomness, which prepends to the event value the
          32 LSBs of a high-resolution timestamp, plus the 64-bit
          Jiffies timestamp.[1; p.55]
              Each HID event contains 15.6 bits of Shannon entropy, but
          due to LRNG's conservative heuristic entropy estimation, on
          average only 1.29 bits of entropy is awarded to the event.
          [1; p.77]

        o add_disk_randomness: Hardware events of block devices, e.g.
          HDDs (but not e.g. SSDs). When a disk event occurs, the block
          device number as well as the timer state variable disk->random
          is mixed into the input_pool via add_timer_randomness.
          [1; pp.50-51]
              Each disk event contains on average 17.7 bits of Shannon
          entropy, but only 0.21 bits of entropy is awarded to the
          event.[1; p.77]

        o add_interrupt_randomness: Interrupts (i.e. signals from SW/HW
          to processor that an event needs immediate attention) occur
          hundreds of thousands of times per second under average load.
              The interrupt timestamps and event data are mixed into
          128-bit, per-CPU pool called fast_pool. When an interrupt
          occurs
            * The 32 LSBs of the high-resolution timestamp, the coarse
              Jiffies, and the interrupt number are XORed with the first
              32-bit word of the fast_pool.
            * The 32 LSBs of the Jiffies and the 32 MSBs of the
              high-resolution timestamp are XORed with the second word
              of the fast_pool.
            * The 32 MSBs and LSBs of the 64-bit CPU instruction pointer
              value are XORed with the third and fourth word of the
              fast_pool. If no pointer is available, the XORed value is
              instead the return address of the add_interrupt_randomness
              function.
          The raw entropy mixed into the fast_pool is then distributed
          more evenly with a function called fast_mix.
              The content of the fast_pool is mixed into the input_pool
          once it has data about at least 64 interrupt events, and
          (unless the ChaCha20 DRNG is being seeded) at least one second
          has passed since the fast_pool was last mixed in. The counter
          keeping track of the interrupt events is then zeroed.
          [1; pp.45-49]
              Each interrupt is assumed to contain 1/32 bits of entropy.
          However, the measured Shannon entropy for each interrupt is
          19.2 bits, which means each 128-bit fast_pool is fed 1228.8
          bits of Shannon entropy.[1; p.77]
              The entire content of the fast_pool is considered to
          increase the internal entropy of the input_pool by 1 bit. If
          the RDSEED (explained below) instruction is available, it is
          used to obtain a 64-bit value that is also mixed into the
          input_pool, and the internal entropy of the input_pool is
          then considered to have increased by another bit.[1; p.48]

    Additional raw entropy sources include

        o RDSEED/RDRAND CPU instructions:
            - Intel: A pair of inverters[2] feeds 512 bits of raw
                     entropy to AES256-CBC-MAC based conditioner (as
                     specified in NIST SP 800-38A), from which bytes
                     can be requested with the RDSEED instruction.
                         The conditioner is used to create 256-bit seeds
                     for the AES256-CTR based DRBG available via the
                     RDRAND instruction. The DRBG is reseeded after
                     every 511th sample of 128 bits (~8kB).[3; p.12]

            - AMD:   A set of 16 ring oscillator chains feeds 512 bits
                     of raw entropy to AES256-CBC-MAC based conditioner
                     again available via RDSEED instruction. The
                     conditioner is used to produce 128-bit seeds --
                     a process that is repeated thrice to create a
                     384-bit seed for the AES256-CTR based DRBG
                     available via the RDRAND instruction. The DRBG is
                     reseeded at least every 2048 queries of 32-bits
                     (8kB).[4; pp.2-3]

          While the RDSEED/RDRAND instructions are used extensively,
          because the CPU HWRNG is not an auditable source, it is
          assumed to provide only a very small amount of entropy.
          [1; p.83]

        o Data written to /dev/(u)random from the user space[1; p.38]
          such as the 4096-bit random-seed that was obtained from the
          ChaCha20 DRNG and written on disk when the previous session
          ended and the system was powered off.[1; p.63]
              While the random-seed might not be mixed in early enough
          during boot to benefit the kernel[5], it is mixed into the
          input_pool before TFC starts.

        o User space IOCTL of RNDADDENTROPY.[1; p.39]

     [1] https://www.bsi.bund.de/SharedDocs/Downloads/EN/BSI/Publications/Studies/LinuxRNG/LinuxRNG_EN.pdf
     [2] https://spectrum.ieee.org/computing/hardware/behind-intels-new-randomnumber-generator
     [3] https://software.intel.com/sites/default/files/managed/98/4a/DRNG_Software_Implementation_Guide_2.1.pdf
     [4] https://www.amd.com/system/files/TechDocs/amd-random-number-generator.pdf
     [5] https://security.stackexchange.com/q/183506


    The input_pool
    ==============

    Overview
    --------
    The input_pool is the 4096-bit primary entropy pool of the LRNG that
    compresses truly random events from different noise sources.[1; p.19]
    Together the noise sources and the input_pool form a constantly
    seeded, non-deterministic random number generator (NDRNG), that
    seeds the ChaCha20-based deterministic random number generator
    (DRNG).[1; p.20]

    Initialization of the input_pool
    --------------------------------
    The input_pool is initialized during boot time of the kernel by
    mixing the following data into the entropy pool:
        1. The current time with nanosecond precision (64-bit CPUs).
        2. Entropy obtained from CPU HWRNG via RDRAND instruction, if
           available.
        3. System specific information such as OS name, release,
           version, and a HW identifier.[1; pp.30-31]

    Initial seeding and seeding levels of the input_pool
    ----------------------------------------------------
    After a hardware event has occurred, the entropy of the event value
    is estimated, and both values are mixed into the input_pool using a
    function based on a linear feedback shift register (LFSR), one byte
    at a time.[1; p.23]

    The input_pool only keeps track if at one point it has had 128 bits
    of entropy in it. When that limit is exceeded, the variable
    `initialized` is set to one.[1; p.22] This level of entropy is
    reached at early boot phase (by the time the user space boots).
    [2; p.6]
        Once the input_pool is initialized, the ChaCha20 DRNG is
    reseeded from the input_pool[2] using 128..256 bits of entropy
    [1; pp.27-28] from the input_pool and at that point the DRNG is
    considered fully seeded.[3]

    State transition and output of the input_pool
    ---------------------------------------------
    When outputting entropy from the input_pool to the ChaCha20 DRNG,
    the input_pool output function first compresses the entire content
    of the input_pool with a SHA-1-like hash function that has the
    transformation function of SHA-1, but that replaces the constants of
    SHA-1 with random values obtained from CPU HWRNG via RDRAND, if
    available.[1; p.29]
        The output function also "folds" the 160-bit digest by slicing
    it into two 80-bit chunks and by then XORing them together to
    produce the final output. At the same time, the output function
    reduces the input_pool entropy estimator by 80 bits.[1; p.18]
        The "SHA-1" digest is mixed back into the input_pool using the
    LFSR-based state transition function to provide backtracking
    resistance.[1; p.18]
        If more than 80-bits of entropy is requested, the
    hash-fold-yield-mix-back operation is repeated until the requested
    number of bytes are generated. (Reseeding the ChaCha20 DRNG requires
    four consecutive requests.)[1; p.18]

    Reseeding of the input_pool
    ---------------------------
    The input_pool is reseeded constantly as random events occur. The
    events are mixed with the LFSR, one byte at a time.
        When the input_pool is full, more entropy keeps getting mixed in
    which is helpful in case the entropy estimator is optimistic: At
    some point the entropy will have reached the maximum of 4096 bits.
        When the input_pool entropy estimator considers the pool to have
    4096 bits of entropy, it will output 1024 bits to blocking_pool for
    the use of /dev/random, and it will then reduce the input_pool's
    entropy estimator by 1024 bits.[1; pp.59-60]

     [1] https://www.bsi.bund.de/SharedDocs/Downloads/EN/BSI/Publications/Studies/LinuxRNG/LinuxRNG_EN.pdf
     [2] https://github.com/torvalds/linux/blob/master/drivers/char/random.c#L791
     [3] https://github.com/torvalds/linux/blob/master/drivers/char/random.c#L1032

    The ChaCha20 DRNG
    =================

    Overview
    --------
    The LRNG uses the ChaCha20 stream cipher as its primary DRNG.

    The internal 64-byte state of the DRNG consists of
        - 16-byte constant b'expand 32-byte k' set by the designer (djb)[1; p.32]
        - 32-byte key (the only part that is reseeded with entropy)
        -  4-byte counter (the counter is actually 64-bits[2])
        - 12-byte nonce

    In addition, the DRNG state contains a 4-byte timestamp called
    init_time, that keeps track of when the DRNG was last seeded.
    [1; pp.32-33]

    Initialization of the DRNG
    --------------------------
    The ChaCha20 DRNG is initialized during the boot time of the kernel
    by using the content of the input_pool (considered to have poor
    entropy at this point[1; p.32]) for key, counter, and nonce parts
    of the DRNG state.
        Each of the three values is XORed with the output from CPU
    HWRNG obtained via RDSEED or RDRAND instruction (if available --
    otherwise only the key is XORed, and that's done with a timestamp
    obtained via the RDTSCP instruction).[1; pp.32-33]
        The initialization is completed by setting the init_time to
    a value that causes the ChaCha20 DRNG to reseed from the input_pool
    the next time it's called.[1; p.33][3]

    Initial seeding and seeding levels of the DRNG
    ----------------------------------------------
    If the RDSEED or RDRAND is available during initialization, and if
    the CPU HWRNG is trusted by the kernel, the DRNG is seeded by the
    CPU HWRNG, after which it is considered fully seeded and the seeding
    steps below are skipped. The DRNG will still reseed from the
    input_pool the next time it is called.[1; p.35]

    **Initially seeded state**
    During initialization time of the kernel, the kernel injects four
    sets of data from fast_pool into the DRNG (instead of the
    input_pool). Each set contains event data and timestamps of 64
    interrupt events from add_interrupt_randomness.[1; p.35]
        In addition, all content from the add_device_randomness source
    is mixed into the DRNG key state using an LFSR with a period of 255.
    [1; p.52]
        Once the entropy sources have been mixed in, the DRNG is
    considered to be initially seeded.[1; p.35]

    **Fully seeded state**
    As of Linux kernel 4.17, if the CPU HWRNG is not trusted, the DRNG
    is considered fully seeded (256-bit entropy) only after, during
    initialization time of the kernel, the input_pool has reached
    128-bit entropy, and the DRNG is reseeded by XORing 128..256 bits
    from the input_pool with the key part of the DRNG state.[1; p.138]
        The time to reach this state might take up to 90 seconds
    [1; p.70], but as the installation of TFC via Tor takes longer than
    that, the DRNG is most likely fully seeded by the time TFC generates
    keys and no blocking affects the user experience.
        According to [1; p.39] and [2; p.11], the ChaCha20 DRNG blocks
    until it is fully seeded. This means TFC's key generation also
    blocks until the ChaCha20 DRNG is fully seeded.

    State transition and output of the DRNG
    ---------------------------------------
    When outputting from ChaCha20 DRNG to the caller, the ChaCha20 block
    function is invoked repeatedly until the requested number of bytes
    are generated. Each invoke yields a 64-byte output block that is
    essentially part of the keystream that in the context of stream
    cipher would be XORed with the plaintext to produce the ciphertext.
        With each generated block the internal 32-bit counter value of
    the ChaCha20 state is incremented by one to ensure unique blocks.[4]
    The state of the DRNG is further stirred by XORing the second 32-bit
    word of the nonce with the output from RDRAND instruction, if
    available.[1; p.33]
        Once the amount of requested random data has been generated, the
    state update function is invoked, which takes a 256-bit block of
    unused keystream and XORs it with the key part of the ChaCha20 state
    to ensure backtracking resistance.[1; pp.33-34]

    The random bytes used in TFC are obtained with the GETRANDOM syscall
    instead of the /dev/urandom device file. This has two major benefits:
        1. It bypasses the Linux kernel's virtual file system (VFS)
           layer, which reduces complexity and possibility of bugs, and
        2. unlike /dev/urandom, GETRANDOM(0) blocks until it has been
           fully seeded.
    [1; pp.39-40]

    Reseeding of the DRNG
    ---------------------
    The ChaCha20 DRNG is reseeded automatically every 300 seconds
    irrespective of the amount of data produced by the DRNG[1; p.32].
        The DRNG is reseeded by obtaining 128..256 bits of entropy
    from the input_pool. In the order of preference, the entropy from
    the input_pool is XORed with the output of
        1. 32-byte value obtained via RDSEED CPU instruction, or
        2. 32-byte value obtained via RDRAND CPU instruction, or
        3. eight 4-byte high-resolution time stamps
    The result is then XORed with the key component of the DRNG state
    [1; p.34].

     [1] https://www.bsi.bund.de/SharedDocs/Downloads/EN/BSI/Publications/Studies/LinuxRNG/LinuxRNG_EN.pdf
     [2] https://lkml.org/lkml/2019/5/30/867
     [3] https://github.com/torvalds/linux/blob/master/drivers/char/random.c#L889
         https://github.com/torvalds/linux/blob/master/drivers/char/random.c#L1058
     [4] https://github.com/torvalds/linux/blob/master/lib/chacha.c#L87
         https://github.com/torvalds/linux/blob/master/drivers/char/random.c#L1064


    GETRANDOM and Python
    ====================

    Since Python 3.6.0, `os.urandom` has been a wrapper for the best
    available CSPRNG. The 3.17 and earlier versions of the Linux kernel
    do not support the GETRANDOM call, and Python 3.7's `os.urandom`
    will in those cases fall back to non-blocking `/dev/urandom` that is
    not secure on live distros as they have low entropy at the start of
    the session.
        To avoid possibly unsafe key generation, instead of `os.urandom`
    TFC uses the `os.getrandom(size, flags=0)` explicitly. This forces
    use of a recent enough Python interpreter (3.6.0 or later) and
    limits the Linux kernel version to 3.17 or newer. To make use of
    the LRNG, the kernel version required by TFC is bumped to 4.8, and
    to make sure the ChaCha20 DRNG is always seeded from input_pool
    before it's considered fully seeded, the final minimum requirement
    is 4.17.
        The flag 0 means GETRANDOM will block if the DRNG is not fully
    seeded.[1]

    Quoting PEP 524 [2]:
        "The os.getrandom() is a thin wrapper on the getrandom()
         syscall/C function and so inherit of its behaviour. For
         example, on Linux, it can return less bytes than
         requested if the syscall is interrupted by a signal."

    However, quoting LWN[3] on GETRANDOM:
        "--reads of 256 bytes or less from /dev/urandom are guaranteed to
         return the full request once that device has been initialized."

    Since the largest key generated in TFC is the 56-byte X448 private
    key, GETRANDOM is guaranteed to always return enough bytes. As a
    good practice however, TFC checks that the length of the obtained
    entropy is correct.

     [1] https://manpages.debian.org/testing/manpages-dev/getrandom.2.en.html
     [2] https://www.python.org/dev/peps/pep-0524/
     [3] https://lwn.net/Articles/606141/


    BLAKE2 compression
    ==================

    The output of GETRANDOM is further compressed with BLAKE2b. The
    preimage resistance of the hash function protects the internal
    state of the entropy pool just in case some user decides to modify
    the source to accept pre-4.8 Linux kernel that has no backtracking
    resistance. Another reason for the hashing is that it's recommended
    by djb[1].

    Since BLAKE2b only produces 1..64 byte digests[2], its use limits
    the size of the generated keys to 64 bytes. This is not a problem
    for TFC because again, the largest key it generates is the 56-byte
    X448 private key. However, because pyca/cryptography manages the
    X448 private key generation, the largest key this function will
    generate is a 32-byte symmetric key.

     [1] https://media.ccc.de/v/32c3-7210-pqchacks#video&t=1116
     [2] https://blake2.net/
    """
    # The key is the BLAKE2b digest of the entropy, so the requested
    # length must be a digest size that BLAKE2b can produce.
    if key_length > BLAKE2_DIGEST_LENGTH_MAX or key_length < BLAKE2_DIGEST_LENGTH_MIN:
        raise CriticalError(f"Invalid key size ({key_length} bytes).")

    # flags=0: see the section `GETRANDOM and Python` above.
    entropy = os.getrandom(key_length, flags=0)  # type: bytes

    # Good-practice sanity checks on what the syscall wrapper returned.
    if not isinstance(entropy, bytes):
        raise CriticalError(f"GETRANDOM returned invalid type data ({type(entropy)}).")

    if len(entropy) != key_length:
        raise CriticalError(f"GETRANDOM returned invalid amount of entropy ({len(entropy)} bytes).")

    # Compress the raw entropy; see the section `BLAKE2 compression`.
    return blake2b(entropy, digest_size=key_length)


def check_kernel_version() -> None:
    """Verify that the running Linux kernel is version 4.17 or newer.

    Kernels from 4.8 onwards use the new ChaCha20 DRNG that, among
    other things, adds backtracking protection.[1]

    The stricter requirement for 4.17 ensures that the ChaCha20 DRNG
    is considered fully seeded only after it has also been seeded by the
    input_pool, not just fast_pool (assuming that the CPU HWRNG isn't
    trusted).[2; p.138]

    A CriticalError is raised if the kernel is older than 4.17.

     [1] https://lkml.org/lkml/2016/7/25/43
     [2] https://www.bsi.bund.de/SharedDocs/Downloads/EN/BSI/Publications/Studies/LinuxRNG/LinuxRNG_EN.pdf
    """
    # Only the first two dot-separated fields of the release string
    # (major and minor version) are relevant for the check.
    release      = os.uname().release
    major, minor = map(int, release.split('.')[:2])

    if (major, minor) < (4, 17):
        raise CriticalError("Insecure kernel CSPRNG version detected.")