Compare commits

...

149 Commits

Author SHA1 Message Date
JCW
af315c0c0a Test blake3 2025-06-26 13:27:55 +01:00
JCW
ecfbe28837 Add blake 3 and skip some unit tests 2025-06-25 15:01:44 +01:00
JCW
033b8cc9e5 Fix a PR comment 2025-06-25 11:30:02 +01:00
JCW
5319edffb0 Fix comments 2025-06-25 11:28:04 +01:00
JCW
4cd5273b44 Add XRPL_ABANDON 2025-06-25 10:38:28 +01:00
Jingchen
e9d46f0bfc Remove OwnerPaysFee as it's never fully supported (#5435)
The OwnerPaysFee amendment was never fully supported, and this change removes the feature to the extent possible.
2025-06-24 18:56:58 +00:00
Bart
42fd74b77b Removes release notes from codebase (#5508) 2025-06-24 13:10:00 -04:00
tequ
c55ea56c5e Add nftoken_id, nftoken_ids, offer_id to meta for transaction stream (#5230) 2025-06-24 09:02:22 -04:00
Michael Legleux
1e01cd34f7 Set version to 2.5.0 2025-06-23 10:13:01 -07:00
Alex Kremer
e2fa5c1b7c chore: Change libXRPL check conan remote to dev (#5482)
This change aligns the Conan remote used by the libXRPL Clio compatibility check workflow with the recent changes applied to Clio.
2025-06-20 17:02:16 +00:00
Ed Hennis
fc0984d286 Require a message on "Application::signalStop" (#5255)
This change adds a message parameter to Application::signalStop for extra context.
2025-06-20 16:24:34 +00:00
Valentin Balaschenko
8b3dcd41f7 refactor: Change getNodeFat Missing Node State Tree error into warning (#5455) 2025-06-20 15:44:42 +00:00
Denis Angell
8f2f5310e2 Fix: Improve error handling in Batch RPC response (#5503) 2025-06-18 17:46:45 -04:00
Michael Legleux
edb4f0342c Set version to 2.5.0-rc2 2025-06-11 17:10:45 -07:00
yinyiqian1
ea17abb92a fix: Ensure delegate tests do not silently fail with batch (#5476)
The tests that ensure `tfInnerBatchTxn` won't block delegated transactions silently fail in `Delegate_test.cpp`. This change removes these cases from that file and adds them to `Batch_test.cpp`, where they do not silently fail because the batch delegate results are explicitly checked there. Moving them also avoids refactoring many helper functions.
2025-06-11 13:21:24 +08:00
Mayukha Vadari
35a40a8e62 fix: Improve multi-sign usage of simulate (#5479)
This change allows users to submit simulate requests from a multi-sign account without needing to specify the accounts that are doing the multi-signing, and fixes an error with simulate that allowed double-signed transactions (where both single-sign and multi-sign public keys are provided).
2025-06-10 14:47:27 +08:00
Ed Hennis
d494bf45b2 refactor: Collapse some split log messages into one (#5347)
Multi-line log messages are hard to work with. Writing this handful of related messages as one message should make the log a tiny bit easier to manage.
2025-06-06 16:01:02 +00:00
Vlad
8bf4a5cbff chore: Remove external project build cores division (#5475)
The CMake statements that appear to halve the number of cores used to build external project dependencies don't actually do anything. This change removes these statements.
2025-06-05 13:37:30 +00:00
Denis Angell
58c2c82a30 fix: Amendment-guard TokenEscrow preclaim and expand tests (#5473)
This change amendment-guards the preclaim for `TokenEscrow` and expands tests to increase code coverage.
2025-06-05 12:54:45 +00:00
Michael Legleux
11edaa441d Set version to 2.5.0-rc1 (#5472) 2025-06-04 17:55:23 +00:00
yinyiqian1
a5e953b191 fix: Add tecNO_DELEGATE_PERMISSION and fix flags (#5465)
* Adds `tecNO_DELEGATE_PERMISSION` for unauthorized transactions sent by a delegated account.
* Returns `tecNO_TARGET` instead of `terNO_ACCOUNT` for the `DelegateSet` transaction if the delegated account does not exist.
* Fixes `tfFullyCanonicalSig` and `tfInnerBatchTxn` blocking transactions issue by adding `tfUniversal` in the permission related masks in `txFlags.h`
2025-06-03 22:20:29 +00:00
Mark Travis
506ae12a8c Increase network i/o capacity (#5464)
The change increases the default network I/O worker thread pool size from 2 to 6. This will improve stability, as worker thread saturation correlates with desyncs, particularly on high-traffic peers such as hubs.
2025-06-03 21:33:09 +00:00
Ayaz Salikhov
0310c5cbe0 fix: Specify transitive_headers when building with Conan 2 (#5462)
To be able to consume `rippled` in Conan 2, the recipe should specify transitive_headers for external libraries that are present in the exported header files. This change retains compatibility with Conan 1, where this flag was not present.
2025-06-03 17:33:32 +00:00
Denis Angell
053e1af7ff Add support for XLS-85 Token Escrow (#5185)
- Specification: https://github.com/XRPLF/XRPL-Standards/pull/272
- Amendment: `TokenEscrow`
- Enables escrowing of IOU and MPT tokens in addition to native XRP.
- Allows accounts to lock issued tokens (IOU/MPT) in escrow objects, with support for freeze, authorization, and transfer rates.
- Adds new ledger fields (`sfLockedAmount`, `sfIssuerNode`, etc.) to track locked balances for IOU and MPT escrows.
- Updates EscrowCreate, EscrowFinish, and EscrowCancel transaction logic to support IOU and MPT assets, including proper handling of trustlines and MPT authorization, transfer rates, and locked balances.
- Enforces invariant checks for escrowed IOU/MPT amounts.
- Extends GatewayBalances RPC to report locked (escrowed) balances.
2025-06-03 12:51:55 -04:00
Vlad
7e24adbdd0 fix: Address NFT interactions with trustlines (#5297)
The changes focus on fixing NFT transactions that bypass the trust line authorization requirement, and on a potential invariant violation when interacting with deep-frozen trust lines.
2025-06-02 16:13:20 +00:00
Gregory Tsipenyuk
621df422a7 fix: Add AMMv1_3 amendment (#5203)
* Add AMM bid/create/deposit/swap/withdraw/vote invariants:
  - Deposit, Withdrawal invariants: `sqrt(asset1Balance * asset2Balance) >= LPTokens`.
  - Bid: `sqrt(asset1Balance * asset2Balance) > LPTokens` and the pool balances don't change.
  - Create: `sqrt(asset1Balance * asset2Balance) == LPTokens`.
  - Swap: `asset1BalanceAfter * asset2BalanceAfter >= asset1BalanceBefore * asset2BalanceBefore`
     and `LPTokens` don't change.
  - Vote: `LPTokens` and pool balances don't change.
  - All AMM and swap transactions: amounts and tokens are greater than zero, except on withdrawal if all tokens
    are withdrawn.
* Add AMM deposit and withdraw rounding to ensure AMM invariant:
  - On deposit, tokens out are rounded downward and deposit amount is rounded upward.
  - On withdrawal, tokens in are rounded upward and withdrawal amount is rounded downward.
* Add Order Book Offer invariant to verify consumed amounts: consumed amounts must be less than the offer.
* Fix Bid validation. `AuthAccount` can't have duplicate accounts or the submitter account.
2025-06-02 09:52:10 -04:00
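The core balance invariant above is simple to state; the following standalone sketch shows the deposit/withdraw form of the check. The function name is hypothetical, and `double` stands in for rippled's own fixed-point numeric type with directed rounding (tokens rounded down, deposit amounts rounded up, per the commit).

```cpp
#include <cassert>
#include <cmath>

// Hedged sketch of the deposit/withdraw invariant: after the operation,
// the geometric mean of the pool balances must still cover the
// outstanding LPTokens.
bool ammBalanceInvariantHolds(double asset1Balance, double asset2Balance, double lpTokens)
{
    return std::sqrt(asset1Balance * asset2Balance) >= lpTokens;
}

int main()
{
    assert(ammBalanceInvariantHolds(100.0, 400.0, 200.0));   // create: exactly equal
    assert(!ammBalanceInvariantHolds(99.0, 400.0, 200.0));   // pool drained below LPTokens
}
```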
Shawn Xie
0a34b5c691 Add support for XLS-81 Permissioned DEX (#5404)
Modified transactions:
- OfferCreate
- Payment

Modified RPCs:
- book_changes
- subscribe
- book_offers
- ripple_path_find
- path_find

Spec: https://github.com/XRPLF/XRPL-Standards/pull/281
2025-05-30 13:24:48 -04:00
Matt Mankins
e0bc3dd51f docs: update example keyserver host in SECURITY.md (#5460) 2025-05-30 08:46:08 -04:00
Bronek Kozicki
dacecd24ba Fix unit build error (#5459)
This change fixes a build error caused by a `using namespace` statement inside a namespace scope.
2025-05-29 20:53:31 +00:00
Mayukha Vadari
05105743e9 chore[tests]: improve env.meta usage (#5457)
This commit makes the ledger close in env.meta conditional on whether the ledger has already been closed (i.e. whether the current open ledger has no transactions in it). This makes it a bit easier to use, as it still works if you close the ledger outside of this usage. Previously, if you accidentally closed the ledger outside of the meta function, it would segfault, and that was incredibly difficult to debug.
2025-05-29 16:28:09 +00:00
Bronek Kozicki
9e1fe9a85e Fix: Improve handling of expired credentials in VaultDeposit (#5452)
This change returns `tecEXPIRED` from VaultDeposit to allow the Transactor to remove the expired credentials.
2025-05-28 10:28:18 -04:00
Vito Tumas
d71ce51901 feat: improve squelching configuration (#5438)
This commit introduces the following changes:
* Renames the `vp_enable` config option to `vp_base_squelch_enable`; it enables squelching for validators.
* Removes `vp_squelch` config option which was used to configure whether to send squelch messages to peers or not. With this flag removed, if squelching is enabled, squelch messages will be sent. This was an option used for debugging.
* Introduces a temporary `vp_base_squelch_max_trusted_peers` config option to change the max number of peers who are selected as validator message sources. This is a temporary option, which will be removed once a good value is found.
* Adds a traffic counter to count the number of times peers ignored squelch messages and kept sending messages for squelched validators.
* Moves the decision of whether squelching is enabled and ready into Slot.h.
2025-05-28 06:30:03 -04:00
Michael Legleux
be668ee26d chore: Update CPP ref source (#5453) 2025-05-27 20:46:25 +00:00
Bart
cae5294b4e chore: Rename docs job (#5398) 2025-05-27 20:03:23 +00:00
Elliot.
cd777f79ef docs: add -j $(nproc) to BUILD.md (#5288)
This improves build times.
2025-05-27 19:11:13 +00:00
Valentin Balaschenko
8b9e21e3f5 docs: Update build instructions for Ubuntu 22.04+ (#5292) 2025-05-27 18:32:25 +00:00
Denis Angell
2a61aee562 Add Batch feature (XLS-56) (#5060)
- Specification: [XRPLF/XRPL-Standards 56](https://github.com/XRPLF/XRPL-Standards/blob/master/XLS-0056d-batch/README.md)
- Amendment: `Batch`
- Implements execution of multiple transactions within a single batch transaction with four execution modes: `tfAllOrNothing`, `tfOnlyOne`, `tfUntilFailure`, and `tfIndependent`.
- Enables atomic multi-party transactions where multiple accounts can participate in a single batch, with up to 8 inner transactions and 8 batch signers per batch transaction.
- Inner transactions use `tfInnerBatchTxn` flag with zero fees, no signature, and empty signing public key.
- Inner transactions are applied after the outer batch succeeds via the `applyBatchTransactions` function in apply.cpp.
- Network layer prevents relay of transactions with `tfInnerBatchTxn` flag - each peer applies inner transactions locally from the batch.
- Batch transactions are excluded from AccountDelegate permissions but inner transactions retain full delegation support.
- Metadata includes `ParentBatchID` linking inner transactions to their containing batch for traceability and auditing.
- Extended STTx with batch-specific signature verification methods and added protocol structures (`sfRawTransactions`, `sfBatchSigners`).
2025-05-23 19:53:53 +00:00
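As a rough illustration of the inner-transaction constraints listed above, the sketch below checks the three properties an inner transaction must have. The flag bit value, field names, and types are illustrative stand-ins, not rippled's protocol definitions.

```cpp
#include <cstdint>
#include <string>

constexpr std::uint32_t tfInnerBatchTxn = 0x40000000;  // hypothetical bit value

struct InnerTxn
{
    std::uint32_t flags = 0;
    std::uint64_t feeDrops = 0;   // inner transactions pay zero fee
    std::string signature;        // must be absent: the outer batch is signed instead
    std::string signingPubKey;    // must be empty
};

// An inner transaction carries tfInnerBatchTxn, pays no fee, and is unsigned.
bool isWellFormedInner(InnerTxn const& tx)
{
    return (tx.flags & tfInnerBatchTxn) != 0 && tx.feeDrops == 0 &&
        tx.signature.empty() && tx.signingPubKey.empty();
}
```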
Bronek Kozicki
40ce8a8833 fix: Fix pseudo-account ID calculation (#5447)
Before #5224, the pseudo-account ID was calculated using a prefix expressed as a `std::uint16_t`. The refactoring that moved the pseudo-account ID calculation to View.cpp accidentally changed the prefix type to `int` (deduced from `auto i = 0`), which in turn changed the length of the input to `sha512Half` from 2 bytes to 4, altering the result.

As a result, the function calculated a different pseudo-account ID after the refactoring, breaking the ledger. This impacts AMMCreate, even when the `SingleAssetVault` amendment is not active. This change restores the prefix type to `std::uint16_t`.
2025-05-23 14:05:36 +00:00
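The bug is easy to reproduce in miniature: a hash over the raw bytes of a prefix sees 4 bytes when the prefix is an `int` and 2 bytes when it is a `std::uint16_t`, so the digests differ even though the numeric value is identical. A sketch, with a stand-in for `sha512Half`'s byte handling:

```cpp
#include <cstddef>
#include <cstdint>
#include <iostream>

// Stand-in for how a hasher consumes the raw bytes of its input.
template <typename T>
std::size_t hashedByteCount(T prefix)
{
    return sizeof(prefix);
}

int main()
{
    auto i = 0;                  // deduced as int: the hash consumes 4 bytes
    std::uint16_t prefix = 0;    // intended type: the hash consumes 2 bytes
    std::cout << hashedByteCount(i) << " vs " << hashedByteCount(prefix) << '\n';  // 4 vs 2
}
```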
Bronek Kozicki
7713ff8c5c Add codecov badge, raise .codecov.yml thresholds (#5428) 2025-05-22 14:43:41 +00:00
Olek
70371a4344 Fix initializer list initialization for GCC-15 (#5443) 2025-05-21 13:28:18 -04:00
Bronek Kozicki
e514de76ed Add single asset vault (XLS-65d) (#5224)
- Specification: XRPLF/XRPL-Standards#239
- Amendment: `SingleAssetVault`
- Implements a vault feature used to store a fungible asset (XRP, IOU, or MPT, but not NFT) and to receive shares in the vault (an MPT) in exchange.
- A vault can be private or public.
- A private vault can use permissioned domains, subject to the `PermissionedDomains` amendment.
- Shares can be exchanged back into asset with `VaultWithdraw`.
- Permissions on the asset in the vault are transitively applied on shares in the vault.
- Issuer of the asset in the vault can clawback with `VaultClawback`.
- Extended `MPTokenIssuance` with `DomainID`, used by the permissioned domain on the vault shares.

Co-authored-by: John Freeman <jfreeman08@gmail.com>
2025-05-20 14:06:41 -04:00
Bart
dd62cfcc22 fix: Update path in CODEOWNERS (#5440) 2025-05-20 15:24:07 +00:00
Michael Legleux
09690f1b38 Set version to 2.5.0-b1 2025-05-18 20:39:18 +01:00
Valentin Balaschenko
380ba9f1c1 Fix: Resolve slow test on macOS pipeline (#5392)
Using std::barrier performs extremely poorly (~1 hour vs ~1 minute to run the test suite) in certain macOS environments.
To unblock our macOS CI pipeline, std::barrier has been replaced with a custom mutex-based barrier (Barrier) that significantly improves performance without compromising correctness.
2025-05-16 10:31:51 +00:00
brettmollin
c3e9380fb4 fix: Update validators-example.txt fix xrplf example URL (#5384) 2025-05-16 09:49:14 +00:00
Jingchen
e3ebc253fa fix: Ensure that coverage file generation is atomic. (#5426)
Running unit tests in parallel means multiple threads can write to the same coverage file, corrupting it so that gcovr cannot parse it. This change adds -fprofile-update=atomic, as instructed by https://gcc.gnu.org/bugzilla/show_bug.cgi?id=68080.
2025-05-12 14:54:01 +00:00
Bart
c6c7c84355 Configure CODEOWNERS for changes to RPC code (#5266)
To ensure changes to any RPC-related code are compatible with other services, such as Clio, the RPC team will be required to review them.
2025-05-12 12:42:03 +00:00
yinyiqian1
28f50cb7cf fix: enable LedgerStateFix for delegation (#5427) 2025-05-10 10:36:11 -04:00
Vito Tumas
3e152fec74 refactor: use east const convention (#5409)
This change refactors the codebase to use the "east const convention", and adds a clang-format rule to follow this convention.
2025-05-08 11:00:42 +00:00
yinyiqian1
2db2791805 Add PermissionDelegation feature (#5354)
This change implements the account permission delegation described in XLS-75d, see https://github.com/XRPLF/XRPL-Standards/pull/257.

* Introduces transaction-level and granular permissions that can be delegated to other accounts.
* Adds `DelegateSet` transaction to grant specified permissions to another account.
* Adds `ltDelegate` ledger object to maintain the permission list for delegating/delegated account pair.
* Adds an optional `Delegate` field in common fields, allowing a delegated account to send transactions on behalf of the delegating account within the granted permission scope. The `Account` field remains the delegating account; the `Delegate` field specifies the delegated account. The transaction is signed by the delegated account.
2025-05-08 06:14:02 -04:00
Vito Tumas
9ec2d7f8ff Enable passive squelching (#5358)
This change updates the squelching logic to accept squelch messages for untrusted validators. As a result, servers will also squelch untrusted validator messages, reducing the duplicate traffic they generate.

In particular:
* Updates squelch message handling logic to squelch messages for all validators, not only trusted ones.
* Updates the logic to send squelch messages to peers that don't squelch themselves
* Increases the threshold for the number of messages that a peer has to deliver to consider it as a candidate for validator messages.
2025-05-02 11:01:45 -04:00
Ed Hennis
4a084ce34c Improve transaction relay logic (#4985)
Combines four related changes:
1. "Decrease `shouldRelay` limit to 30s." Pretty self-explanatory. Currently, the limit is 5 minutes, by which point the `HashRouter` entry could have expired, making this transaction look brand new (and thus causing it to be relayed back to peers which have sent it to us recently).
2.  "Give a transaction more chances to be retried." Will put a transaction into `LedgerMaster`'s held transactions if the transaction gets a `ter`, `tel`, or `tef` result. Old behavior was just `ter`.
     * Additionally, to prevent a transaction from being repeatedly held indefinitely, it must meet some extra conditions. (Documented in a comment in the code.)
3. "Pop all transactions with sequential sequences, or tickets." When a transaction is processed successfully, currently, one held transaction for the same account (if any) will be popped out of the held transactions list, and queued up for the next transaction batch. This change pops all transactions for the account, but only if they have sequential sequences (for non-ticket transactions) or use a ticket. This issue was identified from interactions with @mtrippled's #4504, which was merged, but unfortunately reverted later by #4852. When the batches were spaced out, it could potentially take a very long time for a large number of held transactions for an account to get processed through. However, whether batched or not, this change will help get held transactions cleared out, particularly if a missing earlier transaction is what held them up.
4. "Process held transactions through existing NetworkOPs batching." In the current processing, at the end of each consensus round, all held transactions are directly applied to the open ledger, then the held list is reset. This bypasses all of the logic in `NetworkOPs::apply` which, among other things, broadcasts successful transactions to peers. This means that the transaction may not get broadcast to peers for a really long time (5 minutes in the current implementation, or 30 seconds with this first commit). If the node is a bottleneck (either due to network configuration, or because the transaction was submitted locally), the transaction may not be seen by any other nodes or validators before it expires or causes other problems.
2025-05-01 13:58:18 -04:00
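Change 3 can be sketched as follows, using simplified types: a map keyed by sequence number stands in for the held-transaction list, and tickets are omitted for brevity (in the real change, ticketed transactions are always eligible).

```cpp
#include <cstdint>
#include <map>
#include <vector>

// Pop every held transaction whose sequence continues the consecutive run
// after the just-applied sequence, not merely the first one.
std::vector<std::uint32_t> popSequential(
    std::map<std::uint32_t, int>& heldBySeq,  // seq -> transaction (simplified)
    std::uint32_t lastAppliedSeq)
{
    std::vector<std::uint32_t> ready;
    for (auto next = lastAppliedSeq + 1;; ++next)
    {
        auto it = heldBySeq.find(next);
        if (it == heldBySeq.end())
            break;  // gap found: a missing earlier transaction still blocks the rest
        ready.push_back(it->first);  // queue for the next transaction batch
        heldBySeq.erase(it);
    }
    return ready;
}
```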
Vito Tumas
3502df2174 fix: Replaces random endpoint resolution with sequential (#5365)
This change addresses an issue where `rippled` attempts to connect to an IPv6 address, even when the local network lacks IPv6 support, resulting in a "Network is unreachable" error.

The fix replaces the custom endpoint selection logic with `boost::asio::async_connect`, which sequentially attempts to connect to available endpoints until one succeeds or all fail.
2025-04-28 15:38:55 -04:00
Vlad
fa1e25abef chore: Small clarification to lsfDefaultRipple comment (#5410) 2025-04-25 15:21:27 +00:00
Denis Angell
217ba8dd4d fix: CTID to use correct ledger_index (#5408) 2025-04-24 10:24:10 -04:00
Ed Hennis
405f4613d8 chore: Run CI on PRs that are Ready or have the "DraftRunCI" label (#5400)
- Avoids costly overhead for idle PRs where the CI results don't add any
  value.
2025-04-11 22:20:59 +00:00
Mayukha Vadari
cba512068b refactor: Clean up test logging to make it easier to search (#5396)
This PR replaces the word `failed` with `failure` in any test names and renames some test files to fix MSVC warnings, so that it is easier to search through the test output to find tests that failed.
2025-04-11 09:07:42 +00:00
Valentin Balaschenko
1c99ea23d1 Temporary disable automatic triggering macOS pipeline (#5397)
We temporarily disable running unit tests on macOS on the CI pipeline while we are investigating the delays.
2025-04-10 21:58:29 +02:00
Denis Angell
c4308b216f fix: Adds CTID to RPC tx and updates error (#4738)
This change fixes a number of issues involved with CTID:
* CTID is not present on all RPC tx transactions.
* rpcWRONG_NETWORK is missing from ErrorCodes.cpp
2025-04-10 12:38:52 +00:00
Wietse Wind
aafd2d8525 Fix: admin RPC webhook queue limit removal and timeout reduction (#5163)
When using subscribe at the admin RPC port to send webhooks for the transaction stream to a backend, on large(r) ledgers the endpoint receives fewer HTTP POSTs with TX information than the number of transactions in the ledger. This change removes the hardcoded queue length to avoid dropping TX notifications for the admin-only command. In addition, the per-request TTL for outgoing RPC HTTP calls has been reduced from 10 minutes to 30 seconds.
2025-04-10 06:37:24 +00:00
Denis Angell
a574ec6023 fix: fixPayChanV1 (#4717)
This change introduces a new fix amendment (`fixPayChanV1`) that prevents the creation of a new payment channel via a `PaymentChannelCreate` transaction with a `CancelAfter` time less than the current ledger time. It piggybacks off of fix1571.

Once the amendment is activated, creating a new `PaymentChannel` will require that if you specify the `CancelAfter` time/value, that value must be greater than or equal to the current ledger time.

Currently, users can create a payment channel where the `CancelAfter` time is before the current ledger time. This results in the payment channel being immediately closed by the next PaymentChannel transaction.
2025-04-09 22:08:44 +00:00
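A minimal sketch of the guarded check, assuming hypothetical names and times expressed in Ripple-epoch seconds:

```cpp
#include <cstdint>
#include <optional>

// With fixPayChanV1 active, a PaymentChannelCreate whose CancelAfter is
// already in the past is rejected at creation time instead of producing a
// channel that the next transaction immediately closes.
bool cancelAfterAcceptable(
    bool fixPayChanV1Enabled,
    std::optional<std::uint32_t> cancelAfter,
    std::uint32_t parentCloseTime)
{
    if (fixPayChanV1Enabled && cancelAfter && *cancelAfter < parentCloseTime)
        return false;
    return true;  // CancelAfter omitted, or >= current ledger time
}
```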
Mayukha Vadari
e429455f4d refactor(trivial): reorganize ledger entry tests and helper functions (#5376)
This PR splits out `ledger_entry` tests into its own file (`LedgerEntry_test.cpp`) and alphabetizes the helper functions in `LedgerEntry.cpp`. These commits were split out of #5237 to make that PR a little more manageable, since these basic trivial changes are most of the diff. There is no code change, just moving code around.
2025-04-09 17:02:03 +00:00
Vito Tumas
7692eeb9a0 Instrument proposal, validation and transaction messages (#5348)
Adds metric counters for the following P2P message types:

* Untrusted proposal and validation messages
* Duplicate proposal, validation and transaction messages
2025-04-09 15:33:17 +02:00
Bronek Kozicki
a099f5a804 Remove UNREACHABLE from NetworkOPsImp::processTrustedProposal (#5387)
It’s possible for this to happen legitimately if a set of peers, including a validator, are connected in a cycle, and the latency and message processing time between those peers is significantly less than the latency between the validator and the last peer. It’s unlikely in the real world, but obviously easy to simulate with Antithesis.
2025-04-08 14:43:34 +00:00
Michael Legleux
ca0bc767fe fix: Use the build image from ghcr.io (#5390)
The CI pipelines have been constantly hitting Docker Hub's public rate limits since we increased the number of jobs we run. This change switches over to images hosted in GitHub's registry.
2025-04-05 02:24:31 +00:00
Mayukha Vadari
4ba9288935 fix: disable channel_authorize when signing_support is disabled (#5385) 2025-04-05 01:08:34 +00:00
Valentin Balaschenko
e923ec6d36 Fix to correct memory ordering for compare_exchange_weak and wait in the intrusive reference counting logic (#5381)
This change addresses a memory ordering assertion failure observed on one of the Windows test machines during the IntrusiveShared_test suite.
2025-04-04 18:21:17 +00:00
Vlad
851d99d99e fix: uint128 ambiguousness breaking macos unity build (#5386) 2025-04-04 08:28:33 -04:00
Bart
f608e653ca Fix undefined uint128_t type on Windows non-unity builds (#5377)
As part of import optimization, a transitive include that defined `BOOST_COMP_MSVC` on Windows had been removed. In unity builds this definition was still pulled in, but in non-unity builds it was not, causing a compilation error. An inspection of the Boost code revealed that we can simply gate the statements by `_MSC_VER` instead. A `#pragma message` is added to verify that the statement is only printed on Windows builds.
2025-04-01 11:21:59 -04:00
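The gating pattern described above is a one-liner; a sketch follows (the pragma text is illustrative):

```cpp
// Test the compiler-provided _MSC_VER macro directly instead of Boost's
// BOOST_COMP_MSVC, which was only defined via a since-removed transitive
// include and therefore differed between unity and non-unity builds.
#if defined(_MSC_VER)
#pragma message("compiling MSVC-specific code path")  // visible only on Windows builds
#endif
```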
Vlad
72e076b694 test: enable compile time param to change reference fee value (#5159)
Adds an extra CI pipeline to perform unit tests using different values for fees.
2025-03-27 23:40:36 +00:00
Bart
6cf37c4abe refactor: Move integration tests from 'examples/' into 'tests/' (#5367)
This change moves `examples/example` into `tests/conan` to make it clear that it is an integration test, and adjusts the `conan` CI job accordingly.
2025-03-27 14:49:09 +00:00
Valentin Balaschenko
fc204773d6 Intrusive SHAMap smart pointers for efficient memory use and lock-free synchronization (#5152)
The main goal of this optimisation is memory reduction in SHAMapTreeNodes by introducing intrusive pointers instead of standard std::shared_ptr and std::weak_ptr.
2025-03-25 18:40:25 +00:00
Vlad
2bc5cb240f test: enable unit tests to work with variable reference fee (#5145)
Fix remaining unit tests to be able to process reference fee values other than 10.
2025-03-25 10:31:25 -04:00
Vlad
67028d6ea6 test: enable TxQ unit tests work with variable reference fee (#5118)
In preparation for a potential reference fee change we would like to verify that fee change works as expected. The first step is to fix all unit tests to be able to work with different reference fee values.
2025-03-24 14:56:19 -04:00
Ed Hennis
d22a5057b9 Prevent consensus from getting stuck in the establish phase (#5277)
- Detects if the consensus process is "stalled". If it is, then we can declare a 
  consensus and end successfully even if we do not have 80% agreement on
  our proposal.
  - "Stalled" is defined as:
    - We have a close time consensus
    - Each disputed transaction is individually stalled:
      - It has been in the final "stuck" 95% requirement for at least 2
        (avMIN_ROUNDS) "inner rounds" of phaseEstablish,
      - and either all of the other trusted proposers or this validator, if proposing,
        have had the same vote(s) for at least 4 (avSTALLED_ROUNDS) "inner
        rounds", and at least 80% of the validators (including this one, if
        appropriate) agree about the vote (whether yes or no).
- If we have been in the establish phase for more than 10x the previous
  consensus establish phase's time, then consensus is considered "expired",
  and we will leave the round, which sends a partial validation (indicating
  that the node is moving on without validating). Two restrictions avoid
  prematurely exiting, or having an extended exit in extreme situations.
  - The 10x time is clamped to be within a range of 15s
    (ledgerMAX_CONSENSUS) to 120s (ledgerABANDON_CONSENSUS).
  - If consensus has not had an opportunity to walk through all avalanche
    states (defined as not going through 8 "inner rounds" of phaseEstablish),
    then ConsensusState::Expired is treated as ConsensusState::No.
- When enough nodes leave the round, any remaining nodes will see they've
  fallen behind, and move on, too, generally before hitting the timeout. Any
  validations or partial validations sent during this time will help the
  consensus process bring the nodes back together.
2025-03-20 12:41:44 -04:00
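The expiry clamp in the second bullet can be sketched directly; the constant names below stand in for ledgerMAX_CONSENSUS and ledgerABANDON_CONSENSUS.

```cpp
#include <algorithm>
#include <chrono>

using namespace std::chrono_literals;

// Consensus is "expired" after 10x the previous establish phase's
// duration, clamped to [15s, 120s] so the timeout neither fires
// prematurely nor extends indefinitely in extreme situations.
std::chrono::milliseconds expireTimeout(std::chrono::milliseconds prevEstablishTime)
{
    return std::clamp<std::chrono::milliseconds>(prevEstablishTime * 10, 15s, 120s);
}
```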
Alex Kremer
75a20194c5 chore: Update link to ripple-binary-codec (#5355)
The link to ripple-binary-codec's definitions.json appears to be outdated. The updated link is also documented here: https://xrpl.org/docs/references/protocol/binary-format#definitions-file
2025-03-19 17:33:23 -04:00
Alex Kremer
7fe81fe62e chore: Add PR number to payload (#5310)
This PR adds one more payload field to the libXRPL compatibility check workflow - the PR number itself.
2025-03-18 17:26:08 +00:00
Bronek Kozicki
345ddc7234 fix: Remove null pointer deref, just do abort (#5338)
This change removes the existing undefined behavior from `LogicError`, so we can be certain that there will always be a stacktrace.

Dereferencing a null pointer is an old trick to generate `SIGSEGV`, which would typically also create a stacktrace. However, it is also undefined behaviour, and compilers can do something else. A more robust way to create a stacktrace while crashing the program is to use `std::abort`, which we have also used in this location for a long time. If we combine the two, we might not get the expected behaviour: the null pointer dereference followed by `std::abort`, as handled by certain compiler versions, may not immediately cause a crash. We have observed the stacktrace being wiped instead, the thread put in an indeterminate state, and then a stacktrace created without any useful information.
2025-03-18 12:45:25 -04:00
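A sketch of the resulting shape of the function (names hypothetical):

```cpp
#include <cstdlib>
#include <iostream>

// Report, then call std::abort() directly. std::abort has defined
// behaviour (it raises SIGABRT) and reliably leaves a usable stacktrace,
// unlike a null-pointer dereference, which is undefined behaviour.
[[noreturn]] void logicError(char const* what)
{
    std::cerr << "Logic error: " << what << '\n';
    std::abort();
}
```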
Bart
d167d4864f refactor: Updates Conan dependencies: RocksDB (#5335)
Updates RocksDB to version 9.7.3, the latest version supported in Conan 1.x. A patch for 9.7.4 that fixes a memory leak is included.
2025-03-18 11:25:48 -04:00
Vlad
bf504912a4 fix: trust line RPC no ripple flag (#5345)
The trust line RPC `no_ripple` flag was set based on the `lsfDefaultRipple` flag, which is not a flag of the trust line but of the account root. The `lsfDefaultRipple` flag provides no insight into whether this particular trust line has the `lsfLowNoRipple` or `lsfHighNoRipple` flag set, so it should not be used here at all. This change simplifies the logic.
2025-03-18 09:03:03 -04:00
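A simplified sketch of the corrected lookup; the flag values mirror rippled's ledger flags but should be treated as illustrative.

```cpp
#include <cstdint>

constexpr std::uint32_t lsfLowNoRipple = 0x00100000;
constexpr std::uint32_t lsfHighNoRipple = 0x00200000;

// no_ripple must be read from the trust line's own flags, choosing the
// low or high side depending on which side the viewed account occupies;
// the account root's lsfDefaultRipple says nothing about this line.
bool noRipple(std::uint32_t trustLineFlags, bool viewedAccountIsLowSide)
{
    return trustLineFlags &
        (viewedAccountIsLowSide ? lsfLowNoRipple : lsfHighNoRipple);
}
```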
cyan317
a7fb8ae915 fix: Handle invalid marker parameter in grpc call (#5317)
The `end_marker` is used to limit the range of ledger entries to fetch. If `end_marker` is less than `marker`, a crash can occur. This change adds an additional check.
2025-03-18 08:21:33 -04:00
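The added guard amounts to a range-sanity check before iteration; a sketch with keys simplified to integers (the real keys are 256-bit ledger indexes):

```cpp
#include <cstdint>
#include <optional>

// Reject the request up front when end_marker precedes marker, rather
// than letting the range iteration run past its bound and crash.
bool markerRangeValid(std::uint64_t marker, std::optional<std::uint64_t> endMarker)
{
    return !endMarker || *endMarker >= marker;
}
```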
Sergey Kuznetsov
d9b7a2688f fix: Error message for ledger_entry rpc (#5344)
Changes the error to `malformedAddress` for `permissioned_domain` in the `ledger_entry` rpc, when the account is not a string. This change makes it more clear to a user what is wrong with their request.
2025-03-17 09:14:49 -04:00
Darius Tumas
c0299dba88 Adds hub.xrpl-commons.org as a new Bootstrap Cluster (#5263) 2025-03-17 07:04:46 -04:00
Bronek Kozicki
c3ecdb4746 Rename "deadlock" to "stall" in LoadManager (#5341)
What the LoadManager class does is stall detection, which is not the same as deadlock detection. In the condition of severe CPU starvation, LoadManager will currently intentionally crash rippled reporting `LogicError: Deadlock detected`. This error message is misleading as the condition being detected is not a deadlock. This change fixes and refactors the code in response.
2025-03-14 16:15:09 -04:00
Ed Hennis
c17676a9be refactor: Improve ordering of headers with clang-format (#5343)
Removes all manual header groupings from source and header files by leveraging clang-format options.
2025-03-12 18:33:21 -04:00
Ed Hennis
ed8e32cc92 refactor: Calculate numFeatures automatically (#5324)
Manually updating numFeatures is an annoying process that is easily forgotten and leads to frequent merge conflicts. This change takes advantage of the `XRPL_FEATURE` and `XRPL_FIX` macros, and adds a new `XRPL_RETIRE` macro to automatically set `numFeatures`.
2025-03-12 17:34:06 -04:00
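One way such automatic counting can work is to have each feature macro bump a counter during static initialization. The sketch below illustrates the idea only; it is not rippled's actual macro definitions.

```cpp
#include <cstddef>

static std::size_t numFeatures = 0;

struct FeatureCounter
{
    FeatureCounter() { ++numFeatures; }  // each macro expansion registers itself
};

#define XRPL_FEATURE(name) static FeatureCounter feature_##name;
#define XRPL_FIX(name) static FeatureCounter fix_##name;
#define XRPL_RETIRE(name) static FeatureCounter retired_##name;

XRPL_FEATURE(Batch)
XRPL_FIX(PayChanV1)
XRPL_RETIRE(OwnerPaysFee)
// After static initialization, numFeatures == 3 with no manual update.
```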
Bart
2406b28e64 refactor: Remove unused and add missing includes (#5293)
The codebase is filled with includes that are unused, and which thus can be removed. At the same time, the files often do not include all headers that contain the definitions used in those files. This change uses clang-format and clang-tidy to clean up the includes, with minor manual intervention to ensure the code compiles on all platforms.
2025-03-11 14:16:45 -04:00
Michael Legleux
2216e5a13f Set version to 2.4.0 2025-03-06 10:41:58 -08:00
Michael Legleux
5bf3a308d5 Set version to 2.4.0-rc4 2025-03-03 10:31:11 -08:00
Darius Tumas
53ea31c69a chore: Update XRPL Foundation Validator List URL (#5326) 2025-02-28 18:14:01 -05:00
Ed Hennis
c1c2b5bf52 chore: Move "assert" and "werr" flags from "actions/build" (#5325)
- PR #5228 added assert=TRUE and werr=TRUE CMake flags to the
  build/action.yml script which is used by all CI jobs to build rippled,
  ensuring those flags were always set. The assumption was that only the
  CI jobs used that script, so any extra time cost was offset by the
  benefit of the extra checks. That assumption was incorrect. That
  script is used by other downstream projects. Therefore, those flags
  have been moved into the individual CI jobs' "cmake-args" parameter
  passed to build/action.yml. This will have the same effect for CI jobs
  without any side effects.
2025-02-27 20:42:06 -05:00
Mark Travis
af018c7b0b Log detailed correlated consensus data together (#5302)
Combine multiple related debug log data points into a single message. Allows quick correlation of events that previously were either not logged or, if logged, strewn across multiple lines, making correlation difficult. The Heartbeat Timer and consensus ledger accept processing each have this capability.

Also guarantees that log entries will be written if the node is a validator, regardless of log severity level. Otherwise, the level of these messages is at INFO severity.
2025-02-27 13:02:57 -05:00
Michael Legleux
0a1ca0600f Set version to 2.4.0-rc3 2025-02-26 12:41:15 -08:00
Mark Travis
cd7c62818b fix: Acquire previously failed transaction set from network as new proposal arrives (#5318)
Reset the failure variable.
2025-02-25 20:00:50 -05:00
Bronek Kozicki
37d06bcce8 Fix Replace assert with XRPL_ASSERT (#5312) 2025-02-25 11:43:26 -05:00
Bronek Kozicki
9745718467 fix: Remove 'new parent hash' assert (#5313)
This assert is known to occasionally trigger, without causing errors
downstream. It is replaced with a log message.
2025-02-25 09:14:10 -05:00
Michael Legleux
ab44cc31e2 Set version to 2.4.0-rc2 2025-02-20 15:29:54 -08:00
Ed Hennis
dce3e1efa6 Add logging and improve counting of amendment votes from UNL (#5173)
* Add logging for amendment voting decision process
* When counting "received validations" to determine quorum, count the number of validators actually voting, not the total number of possible votes.
2025-02-20 13:35:04 -05:00
Ed Hennis
159dfb5acb Revert "Reduce duplicate peer traffic for ledger data (#5126)" (#5300)
This reverts commit dd5e6559dd, which introduced a regression causing slow close times and syncing issues. A fix will be attempted later.
2025-02-19 18:52:08 -05:00
Bart
844646dc50 docs: Revert peer port to 51235 (#5299)
Reverts the [port_peer] back to the legacy port 51235 rather than to the default port 2459, to avoid potentially inconveniencing existing operators.
2025-02-19 17:14:00 -05:00
Michael Legleux
01fc8f2209 Set version to 2.4.0-rc1 2025-02-18 13:58:56 -08:00
Olek
43e1d4440e fix: Switch Permissioned Domain to Supported::yes (#5287)
Switches the Permissioned Domain feature's supported flag from Supported::no to Supported::yes so that it can be voted on.
2025-02-15 10:08:25 -05:00
Bart
466849efe8 docs: Clarifies default port of hosts (#5290)
The current comment in the example cfg file incorrectly mentions both "may" and "must". This change fixes this comment to clarify that the default port of hosts is 2459 and that specifying it is therefore optional. It further sets the default port to 2459 instead of the legacy 51235.
2025-02-14 21:37:14 -05:00
Mark Travis
db0fad6826 Log proposals and validations (#5291)
Adds detailed log messages for each validation and proposal received from the network.
2025-02-14 20:48:12 -05:00
Ed Hennis
dd5e6559dd Reduce duplicate peer traffic for ledger data (#5126)
- Drop duplicate outgoing TMGetLedger messages per peer
  - Allow a retry after 30s in case of peer or network congestion.
  - Addresses RIPD-1870
  - (Changes levelization. That is not desirable, and will need to be fixed.)
- Drop duplicate incoming TMGetLedger messages per peer
  - Allow a retry after 15s in case of peer or network congestion.
  - The requestCookie is ignored when computing the hash, thus increasing
    the chances of detecting duplicate messages.
  - With duplicate messages, keep track of the different requestCookies
    (or lack of cookie). When work is finally done for a given request,
    send the response to all the peers that are waiting on the request,
    sending one message per peer, including all the cookies and
    a "directResponse" flag indicating the data is intended for the
    sender, too.
  - Addresses RIPD-1871
- Drop duplicate incoming TMLedgerData messages
  - Addresses RIPD-1869
- Improve logging related to ledger acquisition
- Class "CanProcess" to keep track of processing of distinct items

---------

Co-authored-by: Valentin Balaschenko <13349202+vlntb@users.noreply.github.com>
2025-02-14 18:51:51 -05:00
Bart
7c9d652d9b Support canonical ledger entry names (#5271)
This change enhances the filtering in the ledger, ledger_data, and account_objects methods by also supporting filtering by the canonical name of the LedgerEntryType using case-insensitive matching.
2025-02-14 08:12:19 -08:00
Darius Tumas
dc9e6c37fe chore: Update XRPL Foundation public key (#5289)
Following the XRPL Foundation UNL migration, a new set of keys was generated.
2025-02-14 06:54:29 -08:00
Ed Hennis
01fe9477f4 refactor: Change recursive_mutex to mutex in DatabaseRotatingImp (#5276)
Rewrites the code so that the lock is not held during the callback. Instead it locks twice, once before, and once after. This is safe due to the structure of the code, but is checked after the second lock. This allows mutex_ to be changed back to a regular mutex.
2025-02-13 14:32:37 -08:00
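The lock-twice pattern described above looks roughly like this sketch (types and names illustrative):

```cpp
#include <functional>
#include <mutex>

struct Rotator
{
    std::mutex mutex_;  // a plain mutex now suffices
    int backend_ = 0;

    void rotate(std::function<int(int)> const& makeNewBackend)
    {
        int old;
        {
            std::lock_guard lock(mutex_);  // first lock: snapshot the state
            old = backend_;
        }
        // The callback runs without the lock held, so it cannot deadlock
        // or require a recursive mutex.
        int fresh = makeNewBackend(old);
        {
            std::lock_guard lock(mutex_);  // second lock: publish the result
            backend_ = fresh;  // the real code re-checks the state here
        }
    }
};
```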
Bart
97e3dae6f4 fix: Replace charge() by fee_.update() in OnMessage functions (#5269)
In PeerImp.cpp, if a function is a message handler (onMessage) or is called directly from a message handler, it should use fee_, since the charge function is called when the handler returns (onMessageEnd). If a function is not a message handler, such as a job queue item, it should keep calling charge.
2025-02-13 08:54:01 -08:00
Elliot Lee
e8e7888a23 docs: ensure build_type and CMAKE_BUILD_TYPE match (#5274) 2025-02-13 07:28:23 -08:00
code0xff
b02b8d016c chore: Fix small typos in protocol files (#5279) 2025-02-13 05:48:48 -08:00
Ed Hennis
a079bac153 chore: Rename missing-commits job, and combine nix job files (#5268)
- Rename the job in missing-commits.yml from "check" to "up_to_date",
  because other jobs named "check" prevent merges, but this one should
  not prevent merges. How else are branches going to get caught up?
- Move the job in instrumentation.yml to nix.yml, but keep it entirely
  independent.
2025-02-12 05:44:03 -08:00
Ed Hennis
3a55a64e1c docs: Add a summary of the git commit message rules (#5283) 2025-02-11 15:50:51 -05:00
Olek
fa5a85439f fix: Amendment to add transaction flag checking functionality for Credentials (#5250)
CredentialCreate / CredentialAccept / CredentialDelete transactions will check the sfFlags field in preflight() when the amendment is enabled.
2025-02-10 12:33:37 -08:00
Donovan Hide
81034596a8 fix: Omit superfluous setCurrentThreadName call in GRPCServer.cpp (#5280) 2025-02-10 09:08:36 -08:00
Bronek Kozicki
0968cdf340 fix: Do not allow creating Permissioned Domains if credentials are not enabled (#5275)
If the permissioned domains amendment XLS-80 is enabled before credentials XLS-70, then permissioned domain users will not be able to match any credentials. The changes here prevent the creation of any permissioned domain objects if credentials are not enabled.
2025-02-07 15:11:29 -08:00
Mayukha Vadari
d9e4009e33 fix: issues in simulate RPC (#5265)
Make `simulate` RPC easier to use:
* Prevent the use of `seed`, `secret`, `seed_hex`, and `passphrase` fields (to avoid confusing with the signing methods).
* Add autofilling of the `NetworkID` field.
2025-02-07 12:17:37 -08:00
Bart
02387fd227 Updates Conan dependencies (#5256)
This PR updates several Conan dependencies:
* boost
* date
* libarchive
* libmysqlclient
* libpq
* lz4
* onetbb
* openssl
* sqlite3
* zlib
* zstd
2025-02-06 13:11:49 -08:00
Shawn Xie
fb3713bc25 Amendment fixFrozenLPTokenTransfer (#5227)
Prohibits LPToken holders from sending LPTokens to others if the holder has been frozen by the issuer of one of the assets in the AMM pool.
2025-02-05 10:05:24 -08:00
Ed Hennis
f6d63082c0 Improve git commit hash lookup (#5225)
- Also get the branch name.
- Use rev-parse instead of describe to get a clean hash.
- Return the git hash and branch name in server_info for admin
  connections.
- Include git hash and branch name on separate lines in --version.
2025-02-05 11:36:43 -05:00
Vlad
33e1c42599 Add deep freeze feature (XLS-77d) (#5187)
- spec: XRPLF/XRPL-Standards#220
- amendment: "DeepFreeze"
- implemented the deep freeze spec, allowing token issuers to prevent currency holders from acquiring more of these tokens.
- in combination with a normal freeze, deep freeze effectively prevents any trust line balance change for a currency holder (except direct issuer <-> holder payments).
- added 2 new invariant checks to verify that deep freeze cannot be enacted without normal freeze and that transfers are not frozen.
- made some fixes to existing freeze handling.

Co-authored-by: Ed Hennis <ed@ripple.com>
Co-authored-by: Howard Hinnant <howard.hinnant@gmail.com>
2025-01-31 13:40:33 -05:00
Ed Hennis
1b75dc8bcd Set version to 2.4.0-b3 2025-01-29 19:19:26 -05:00
Ed Hennis
3d02580c09 Merge remote-tracking branch 'upstream/master' into merge231
Hotfix: version 2.3.1
  Reduce the peer charges for well-behaved peers
  Update conan in the "nix" CI jobs
2025-01-29 18:11:02 -05:00
Ed Hennis
8458233a31 Set version to 2.3.1 2025-01-29 09:40:31 -05:00
Ed Hennis
cb0ddbf863 Update conan in the "nix" CI jobs 2025-01-29 09:40:30 -05:00
Mayukha Vadari
dcc4581220 Add RPC "simulate" to execute a dry run of a transaction (#5069)
- Spec: https://github.com/XRPLF/XRPL-Standards/tree/master/XLS-0069d-simulate
- Also update signing methods to autofill fees better and properly handle transactions that require a non-standard fee.
2025-01-28 19:02:28 -05:00
Olek
50b8f19cb5 Fix CI unit tests (#5196)
- Add retries for rpc client
- Add dynamic port allocation for rpc servers
2025-01-28 10:45:59 -05:00
Valentin Balaschenko
f3e201f983 Set version to 2.3.1-rc1 2025-01-27 19:43:14 -05:00
Valentin Balaschenko
b14c24960b Reduce the peer charges for well-behaved peers:
- Fix an erroneous high fee penalty that peers could incur for sending older transactions.
- Update the fees charged for imposing a load on the server.
- Prevent the relaying of internal pseudo-transactions.
  - Before: Pseudo-transactions received from a peer will fail the signature check, even if they were requested (using TMGetObjectByHash), because they have no signature. This causes the peer to be charged for an invalid signature.
  - After: Pseudo-transactions are put into the global cache (TransactionMaster) only. If the transaction is not part of a TMTransactions batch, the peer is charged an unwanted data fee. These fees will not be a problem in the normal course of operations, but should dissuade peers from behaving badly by sending a bunch of junk.
- Improve logging: include the reason for fees charged to a peer.

Co-authored-by: Ed Hennis <ed@ripple.com>
2025-01-27 19:41:22 -05:00
Michael Legleux
b6e3453f49 Update secp256k1 library to 0.6.0 (#5254) 2025-01-27 19:47:47 +00:00
Ed Hennis
ed4870cdb4 chore: Update Visual Studio CI to VS 2022, and add VS Debug builds (#5240)
* Debug builds do not run tests, because they take too long.
2025-01-24 18:46:47 -05:00
Bronek Kozicki
5fbee8c824 Add [validator_list_threshold] to validators.txt to improve UNL security (#5112) 2025-01-23 18:00:34 -05:00
Bronek Kozicki
3868c04e99 Switch from assert to XRPL_ASSERT (#5245) 2025-01-23 16:56:37 -05:00
tequ
409c1d5aa2 Add missing space character to a log message (#5251) 2025-01-23 15:08:14 -05:00
Bronek Kozicki
20710f5232 Cleanup API-CHANGELOG.md (#5207) 2025-01-23 14:38:18 -05:00
Ed Hennis
870882f567 test: Unit tests to recreate invalid index logic error (#5242)
* One hits the global cache, one does not.
* Also some extra checking.

Co-authored-by: Bronek Kozicki <brok@incorrekt.com>
2025-01-23 13:35:13 -05:00
Ed Hennis
e1e67b2c9e Update branch management and merge / release processes (#5215)
* Has more steps, but allows merges to develop to continue when a
  beta / RC is pending, increasing developer velocity.
* Add a CI job to check that no reverse merges have been missed.
* Add some useful scripts in bin/git:
  * Set up upstreams as expected for safer pushes
  * Squash a bunch of branches
  * Set the version number
2025-01-22 19:02:13 -05:00
Sergey Kuznetsov
eac3abdca9 fix: Error consistency in LedgerEntry::parsePermissionedDomains() (#5252)
Update errors for parsing permissioned domains in the LedgerEntry handler to make them consistent with other parsers.
2025-01-21 13:00:21 -05:00
Ed Hennis
ebd8e63276 Set version to 2.4.0-b2 2025-01-16 16:25:55 -05:00
Ed Hennis
839d17e7bd fix: Use consistent CMake settings for all modules (#5228)
* Resolves an issue introduced in #5111, which inadvertently removed the
  -Wno-maybe-uninitialized compiler option from some xrpl.libxrpl
  modules. This resulted in new "may be used uninitialized" build
  warnings, first noticed in the "protocol" module. When compiling with
  derr=TRUE, those warnings became errors, which made the build fail.
* Github CI actions will build with the assert and werr options turned
  on. This will cause CI jobs to fail if a developer introduces a new
  compiler warning, or causes an assert to fail in release builds.
* Includes the OS and compiler version in the linux dependencies jobs in
  the "check environment" step.
* Translates the `unity` build option into `CMAKE_UNITY_BUILD` setting.
2025-01-16 16:10:30 -05:00
Valentin Balaschenko
7be5c31bc6 Fix levelization script to ignore commented includes (#5194)
Adds a check to ignore single-line comments during dependency analysis.
2025-01-16 15:23:40 -05:00
tequ
9e4a7d5871 Fix the flag processing of NFTokenModify (#5246)
Adds checks for invalid flags.
2025-01-16 10:37:52 -05:00
Mayukha Vadari
ff8b9aa439 Fix failing assert in connect RPC (#5235) 2025-01-14 14:52:38 -05:00
Olek
ccc0889803 Permissioned Domains (XLS-80d) (#5161) 2025-01-10 12:44:14 -05:00
Mayukha Vadari
07f118caec chore: update deprecated Github Actions (#5241) 2025-01-09 16:32:32 -05:00
tequ
58af62f388 XLS-46: DynamicNFT (#5048)
This Amendment adds functionality to update the URI of NFToken objects as described in the XLS-46d: Dynamic Non Fungible Tokens (dNFTs) spec.
2025-01-09 11:22:11 -05:00
rrmanukyan
040cd23e4a chore: add macos dependency installation (#5233)
* python (3.13) and cmake (latest)
2025-01-07 12:08:39 -05:00
Shawn Xie
0324764a83 prefix Uint384 and Uint512 with Hash in server_definitions (#5231) 2025-01-02 16:32:15 -05:00
Mayukha Vadari
679e35fd46 refactor: add rpcName to LEDGER_ENTRY macro (#5202)
The LEDGER_ENTRY macro now takes an additional parameter, which makes it harder to forget to include the new field in jss.h and in the list of account_objects/ledger_data filters.
2025-01-02 11:54:36 -05:00
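The idea can be sketched as follows; the expansion and the values in the usage line are purely illustrative, not rippled's actual definitions.

```cpp
#include <iostream>
#include <string>

void registerLedgerEntry(int value, std::string const& name, std::string const& rpcName)
{
    // A single declaration point can feed both the jss.h-style name table
    // and the account_objects/ledger_data filter lists.
    std::cout << name << " (0x" << std::hex << value << ") exposed over RPC as "
              << rpcName << '\n';
}

#define LEDGER_ENTRY(tag, value, name, rpcName) \
    registerLedgerEntry(value, #name, #rpcName);

int main()
{
    LEDGER_ENTRY(ltDELEGATE, 0x0083, Delegate, delegate)  // values illustrative
}
```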
1306 changed files with 109391 additions and 29932 deletions

.clang-format

@@ -44,6 +44,7 @@ DerivePointerAlignment: false
DisableFormat: false
ExperimentalAutoDetectBinPacking: false
ForEachMacros: [ Q_FOREACH, BOOST_FOREACH ]
IncludeBlocks: Regroup
IncludeCategories:
- Regex: '^<(test)/'
Priority: 0
@@ -53,8 +54,12 @@ IncludeCategories:
Priority: 2
- Regex: '^<(boost)/'
Priority: 3
- Regex: '.*'
- Regex: '^.*/'
Priority: 4
- Regex: '^.*\.h'
Priority: 5
- Regex: '.*'
Priority: 6
IncludeIsMainRegex: '$'
IndentCaseLabels: true
IndentFunctionDeclarationAfterType: false
@@ -89,3 +94,4 @@ SpacesInSquareBrackets: false
Standard: Cpp11
TabWidth: 8
UseTab: Never
QualifierAlignment: Right

.codecov.yml

@@ -7,13 +7,13 @@ comment:
show_carryforward_flags: false
coverage:
range: "60..80"
range: "70..85"
precision: 1
round: nearest
status:
project:
default:
target: 60%
target: 75%
threshold: 2%
patch:
default:

.github/CODEOWNERS (new file)

@@ -0,0 +1,8 @@
# Allow anyone to review any change by default.
*
# Require the rpc-reviewers team to review changes to the rpc code.
include/xrpl/protocol/ @xrplf/rpc-reviewers
src/libxrpl/protocol/ @xrplf/rpc-reviewers
src/xrpld/rpc/ @xrplf/rpc-reviewers
src/xrpld/app/misc/ @xrplf/rpc-reviewers


@@ -14,8 +14,9 @@ runs:
run: |
conan config set general.revisions_enabled=1
conan export external/snappy snappy/1.1.10@
conan export external/rocksdb rocksdb/6.29.5@
conan export external/rocksdb rocksdb/9.7.3@
conan export external/soci soci/4.0.3@
conan export external/nudb nudb/2.0.8@
- name: add Ripple Conan remote
shell: bash
run: |
@@ -54,7 +55,3 @@ runs:
--options xrpld=True \
--settings build_type=${{ inputs.configuration }} \
..
- name: upload dependencies to remote
if: (steps.binaries.outputs.missing != '[]') && (steps.remote.outputs.outcome == 'success')
shell: bash
run: conan upload --remote ripple '*' --all --parallel --confirm

.github/workflows/clang-format.yml

@@ -1,59 +1,63 @@
name: clang-format
on: [push, pull_request]
on:
push:
pull_request:
types: [opened, reopened, synchronize, ready_for_review]
jobs:
check:
if: ${{ github.event_name == 'push' || github.event.pull_request.draft != true || contains(github.event.pull_request.labels.*.name, 'DraftRunCI') }}
runs-on: ubuntu-24.04
env:
CLANG_VERSION: 18
steps:
- uses: actions/checkout@v4
- name: Install clang-format
run: |
codename=$( lsb_release --codename --short )
sudo tee /etc/apt/sources.list.d/llvm.list >/dev/null <<EOF
deb http://apt.llvm.org/${codename}/ llvm-toolchain-${codename}-${CLANG_VERSION} main
deb-src http://apt.llvm.org/${codename}/ llvm-toolchain-${codename}-${CLANG_VERSION} main
EOF
wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add
sudo apt-get update
sudo apt-get install clang-format-${CLANG_VERSION}
- name: Format first-party sources
run: find include src -type f \( -name '*.cpp' -o -name '*.hpp' -o -name '*.h' -o -name '*.ipp' \) -exec clang-format-${CLANG_VERSION} -i {} +
- name: Check for differences
id: assert
run: |
set -o pipefail
git diff --exit-code | tee "clang-format.patch"
- name: Upload patch
if: failure() && steps.assert.outcome == 'failure'
uses: actions/upload-artifact@v3
continue-on-error: true
with:
name: clang-format.patch
if-no-files-found: ignore
path: clang-format.patch
- name: What happened?
if: failure() && steps.assert.outcome == 'failure'
env:
PREAMBLE: |
If you are reading this, you are looking at a failed Github Actions
job. That means you pushed one or more files that did not conform
to the formatting specified in .clang-format. That may be because
you neglected to run 'git clang-format' or 'clang-format' before
committing, or that your version of clang-format has an
incompatibility with the one on this
machine, which is:
SUGGESTION: |
- uses: actions/checkout@v4
- name: Install clang-format
run: |
codename=$( lsb_release --codename --short )
sudo tee /etc/apt/sources.list.d/llvm.list >/dev/null <<EOF
deb http://apt.llvm.org/${codename}/ llvm-toolchain-${codename}-${CLANG_VERSION} main
deb-src http://apt.llvm.org/${codename}/ llvm-toolchain-${codename}-${CLANG_VERSION} main
EOF
wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add
sudo apt-get update
sudo apt-get install clang-format-${CLANG_VERSION}
- name: Format first-party sources
run: find include src tests -type f \( -name '*.cpp' -o -name '*.hpp' -o -name '*.h' -o -name '*.ipp' \) -exec clang-format-${CLANG_VERSION} -i {} +
- name: Check for differences
id: assert
run: |
set -o pipefail
git diff --exit-code | tee "clang-format.patch"
- name: Upload patch
if: failure() && steps.assert.outcome == 'failure'
uses: actions/upload-artifact@v4
continue-on-error: true
with:
name: clang-format.patch
if-no-files-found: ignore
path: clang-format.patch
- name: What happened?
if: failure() && steps.assert.outcome == 'failure'
env:
PREAMBLE: |
If you are reading this, you are looking at a failed Github Actions
job. That means you pushed one or more files that did not conform
to the formatting specified in .clang-format. That may be because
you neglected to run 'git clang-format' or 'clang-format' before
committing, or that your version of clang-format has an
incompatibility with the one on this
machine, which is:
SUGGESTION: |
To fix it, you can do one of two things:
1. Download and apply the patch generated as an artifact of this
job to your repo, commit, and push.
2. Run 'git-clang-format --extensions cpp,h,hpp,ipp develop'
in your repo, commit, and push.
run: |
echo "${PREAMBLE}"
clang-format-${CLANG_VERSION} --version
echo "${SUGGESTION}"
exit 1
To fix it, you can do one of two things:
1. Download and apply the patch generated as an artifact of this
job to your repo, commit, and push.
2. Run 'git-clang-format --extensions cpp,h,hpp,ipp develop'
in your repo, commit, and push.
run: |
echo "${PREAMBLE}"
clang-format-${CLANG_VERSION} --version
echo "${SUGGESTION}"
exit 1


@@ -10,11 +10,11 @@ concurrency:
cancel-in-progress: true
jobs:
job:
documentation:
runs-on: ubuntu-latest
permissions:
contents: write
container: rippleci/rippled-build-ubuntu:aaf5e3e
container: ghcr.io/xrplf/rippled-build-ubuntu:aaf5e3e
steps:
- name: checkout
uses: actions/checkout@v4

.github/workflows/instrumentation.yml (deleted)

@@ -1,103 +0,0 @@
name: instrumentation
on:
pull_request:
push:
# If the branches list is ever changed, be sure to change it on all
# build/test jobs (nix, macos, windows, instrumentation)
branches:
# Always build the package branches
- develop
- release
- master
# Branches that opt-in to running
- 'ci/**'
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
# NOTE we are not using dependencies built inside nix because nix is lagging
# with compiler versions. Instrumentation requires clang version 16 or later
instrumentation-build:
env:
CLANG_RELEASE: 16
strategy:
fail-fast: false
runs-on: [self-hosted, heavy]
container: debian:bookworm
steps:
- name: install prerequisites
env:
DEBIAN_FRONTEND: noninteractive
run: |
apt-get update
apt-get install --yes --no-install-recommends \
clang-${CLANG_RELEASE} clang++-${CLANG_RELEASE} \
python3-pip python-is-python3 make cmake git wget
apt-get clean
update-alternatives --install \
/usr/bin/clang clang /usr/bin/clang-${CLANG_RELEASE} 100 \
--slave /usr/bin/clang++ clang++ /usr/bin/clang++-${CLANG_RELEASE}
update-alternatives --auto clang
pip install --no-cache --break-system-packages "conan<2"
- name: checkout
uses: actions/checkout@v4
- name: prepare environment
run: |
mkdir ${GITHUB_WORKSPACE}/.build
echo "SOURCE_DIR=$GITHUB_WORKSPACE" >> $GITHUB_ENV
echo "BUILD_DIR=$GITHUB_WORKSPACE/.build" >> $GITHUB_ENV
echo "CC=/usr/bin/clang" >> $GITHUB_ENV
echo "CXX=/usr/bin/clang++" >> $GITHUB_ENV
- name: configure Conan
run: |
conan profile new --detect default
conan profile update settings.compiler=clang default
conan profile update settings.compiler.version=${CLANG_RELEASE} default
conan profile update settings.compiler.libcxx=libstdc++11 default
conan profile update settings.compiler.cppstd=20 default
conan profile update options.rocksdb=False default
conan profile update \
'conf.tools.build:compiler_executables={"c": "/usr/bin/clang", "cpp": "/usr/bin/clang++"}' default
conan profile update 'env.CXXFLAGS="-DBOOST_ASIO_DISABLE_CONCEPTS"' default
conan profile update 'conf.tools.build:cxxflags+=["-DBOOST_ASIO_DISABLE_CONCEPTS"]' default
conan export external/snappy snappy/1.1.10@
conan export external/soci soci/4.0.3@
- name: build dependencies
run: |
cd ${BUILD_DIR}
conan install ${SOURCE_DIR} \
--output-folder ${BUILD_DIR} \
--install-folder ${BUILD_DIR} \
--build missing \
--settings build_type=Debug
- name: build with instrumentation
run: |
cd ${BUILD_DIR}
cmake -S ${SOURCE_DIR} -B ${BUILD_DIR} \
-Dvoidstar=ON \
-Dtests=ON \
-Dxrpld=ON \
-DCMAKE_BUILD_TYPE=Debug \
-DSECP256K1_BUILD_BENCHMARK=OFF \
-DSECP256K1_BUILD_TESTS=OFF \
-DSECP256K1_BUILD_EXHAUSTIVE_TESTS=OFF \
-DCMAKE_TOOLCHAIN_FILE=${BUILD_DIR}/build/generators/conan_toolchain.cmake
cmake --build . --parallel $(nproc)
- name: verify instrumentation enabled
run: |
cd ${BUILD_DIR}
./rippled --version | grep libvoidstar
- name: run unit tests
run: |
cd ${BUILD_DIR}
./rippled -u --unittest-jobs $(( $(nproc)/4 ))

.github/workflows/levelization.yml

@@ -1,49 +1,53 @@
name: levelization
on: [push, pull_request]
on:
push:
pull_request:
types: [opened, reopened, synchronize, ready_for_review]
jobs:
check:
if: ${{ github.event_name == 'push' || github.event.pull_request.draft != true || contains(github.event.pull_request.labels.*.name, 'DraftRunCI') }}
runs-on: ubuntu-latest
env:
CLANG_VERSION: 10
steps:
- uses: actions/checkout@v4
- name: Check levelization
run: Builds/levelization/levelization.sh
- name: Check for differences
id: assert
run: |
set -o pipefail
git diff --exit-code | tee "levelization.patch"
- name: Upload patch
if: failure() && steps.assert.outcome == 'failure'
uses: actions/upload-artifact@v3
continue-on-error: true
with:
name: levelization.patch
if-no-files-found: ignore
path: levelization.patch
- name: What happened?
if: failure() && steps.assert.outcome == 'failure'
env:
MESSAGE: |
If you are reading this, you are looking at a failed Github
Actions job. That means you changed the dependency relationships
between the modules in rippled. That may be an improvement or a
regression. This check doesn't judge.
- uses: actions/checkout@v4
- name: Check levelization
run: Builds/levelization/levelization.sh
- name: Check for differences
id: assert
run: |
set -o pipefail
git diff --exit-code | tee "levelization.patch"
- name: Upload patch
if: failure() && steps.assert.outcome == 'failure'
uses: actions/upload-artifact@v4
continue-on-error: true
with:
name: levelization.patch
if-no-files-found: ignore
path: levelization.patch
- name: What happened?
if: failure() && steps.assert.outcome == 'failure'
env:
MESSAGE: |
If you are reading this, you are looking at a failed Github
Actions job. That means you changed the dependency relationships
between the modules in rippled. That may be an improvement or a
regression. This check doesn't judge.
A rule of thumb, though, is that if your changes caused
something to be removed from loops.txt, that's probably an
improvement. If something was added, it's probably a regression.
A rule of thumb, though, is that if your changes caused
something to be removed from loops.txt, that's probably an
improvement. If something was added, it's probably a regression.
To fix it, you can do one of two things:
1. Download and apply the patch generated as an artifact of this
job to your repo, commit, and push.
2. Run './Builds/levelization/levelization.sh' in your repo,
commit, and push.
To fix it, you can do one of two things:
1. Download and apply the patch generated as an artifact of this
job to your repo, commit, and push.
2. Run './Builds/levelization/levelization.sh' in your repo,
commit, and push.
See Builds/levelization/README.md for more info.
run: |
echo "${MESSAGE}"
exit 1
See Builds/levelization/README.md for more info.
run: |
echo "${MESSAGE}"
exit 1

View File

@@ -1,6 +1,6 @@
name: Check libXRPL compatibility with Clio
env:
CONAN_URL: http://18.143.149.228:8081/artifactory/api/conan/conan-non-prod
CONAN_URL: http://18.143.149.228:8081/artifactory/api/conan/dev
CONAN_LOGIN_USERNAME_RIPPLE: ${{ secrets.CONAN_USERNAME }}
CONAN_PASSWORD_RIPPLE: ${{ secrets.CONAN_TOKEN }}
on:
@@ -8,19 +8,21 @@ on:
paths:
- 'src/libxrpl/protocol/BuildInfo.cpp'
- '.github/workflows/libxrpl.yml'
types: [opened, reopened, synchronize, ready_for_review]
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
publish:
if: ${{ github.event_name == 'push' || github.event.pull_request.draft != true || contains(github.event.pull_request.labels.*.name, 'DraftRunCI') }}
name: Publish libXRPL
outputs:
outcome: ${{ steps.upload.outputs.outcome }}
version: ${{ steps.version.outputs.version }}
channel: ${{ steps.channel.outputs.channel }}
runs-on: [self-hosted, heavy]
container: rippleci/rippled-build-ubuntu:aaf5e3e
container: ghcr.io/xrplf/rippled-build-ubuntu:aaf5e3e
steps:
- name: Wait for essential checks to succeed
uses: lewagon/wait-on-check-action@v1.3.4
@@ -85,4 +87,5 @@ jobs:
run: |
gh api --method POST -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" \
/repos/xrplf/clio/dispatches -f "event_type=check_libxrpl" \
-F "client_payload[version]=${{ needs.publish.outputs.version }}@${{ needs.publish.outputs.channel }}"
-F "client_payload[version]=${{ needs.publish.outputs.version }}@${{ needs.publish.outputs.channel }}" \
-F "client_payload[pr]=${{ github.event.pull_request.number }}"

View File

@@ -1,6 +1,7 @@
name: macos
on:
pull_request:
types: [opened, reopened, synchronize, ready_for_review]
push:
# If the branches list is ever changed, be sure to change it on all
# build/test jobs (nix, macos, windows, instrumentation)
@@ -18,6 +19,7 @@ concurrency:
jobs:
test:
if: ${{ github.event_name == 'push' || github.event.pull_request.draft != true || contains(github.event.pull_request.labels.*.name, 'DraftRunCI') }}
strategy:
matrix:
platform:
@@ -41,6 +43,24 @@ jobs:
- name: install Ninja
if: matrix.generator == 'Ninja'
run: brew install ninja
- name: install python
run: |
if which python > /dev/null 2>&1; then
echo "Python executable exists"
else
brew install python@3.13
ln -s /opt/homebrew/bin/python3 /opt/homebrew/bin/python
fi
- name: install cmake
run: |
if which cmake > /dev/null 2>&1; then
echo "cmake executable exists"
else
brew install cmake
fi
- name: install nproc
run: |
brew install coreutils
- name: check environment
run: |
env | sort
@@ -48,11 +68,16 @@ jobs:
python --version
conan --version
cmake --version
nproc --version
echo -n "nproc returns: "
nproc
system_profiler SPHardwareDataType
sysctl -n hw.logicalcpu
clang --version
- name: configure Conan
run : |
conan profile new default --detect || true
conan profile update settings.compiler.cppstd=20 default
conan profile update 'conf.tools.build:cxxflags+=["-DBOOST_ASIO_DISABLE_CONCEPTS"]' default
- name: build dependencies
uses: ./.github/actions/dependencies
env:
@@ -66,6 +91,9 @@ jobs:
with:
generator: ${{ matrix.generator }}
configuration: ${{ matrix.configuration }}
cmake-args: "-Dassert=TRUE -Dwerr=TRUE ${{ matrix.cmake-args }}"
- name: test
run: |
${build_dir}/rippled --unittest
n=$(nproc)
echo "Using $n test jobs"
${build_dir}/rippled --unittest --unittest-jobs $n

.github/workflows/missing-commits.yml vendored Normal file
View File

@@ -0,0 +1,60 @@
name: missing-commits
on:
push:
branches:
# Only check that the branches are up to date when updating the
# relevant branches.
- develop
- release
jobs:
up_to_date:
runs-on: ubuntu-24.04
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Check for missing commits
id: commits
env:
SUGGESTION: |
If you are reading this, then the commits indicated above are
missing from "develop" and/or "release". Do a reverse-merge
as soon as possible. See CONTRIBUTING.md for instructions.
run: |
set -o pipefail
# Branches ordered by how "canonical" they are. Every commit in
# one branch should be in all the branches behind it
order=( master release develop )
branches=()
for branch in "${order[@]}"
do
# Check that the branches exist so that this job will work on
# forked repos, which don't necessarily have master and
# release branches.
if git ls-remote --exit-code --heads origin \
refs/heads/${branch} > /dev/null
then
branches+=( origin/${branch} )
fi
done
prior=()
for branch in "${branches[@]}"
do
if [[ ${#prior[@]} -ne 0 ]]
then
echo "Checking ${prior[@]} for commits missing from ${branch}"
git log --oneline --no-merges "${prior[@]}" \
^$branch | tee -a "missing-commits.txt"
echo
fi
prior+=( "${branch}" )
done
if [[ $( cat missing-commits.txt | wc -l ) -ne 0 ]]
then
echo "${SUGGESTION}"
exit 1
fi

View File

@@ -1,23 +1,24 @@
name: nix
on:
pull_request:
types: [opened, reopened, synchronize, ready_for_review]
push:
# If the branches list is ever changed, be sure to change it on all
# build/test jobs (nix, macos, windows, instrumentation)
# build/test jobs (nix, macos, windows)
branches:
# Always build the package branches
- develop
- release
- master
# Branches that opt-in to running
- 'ci/**'
- "ci/**"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
# This workflow has two job matrixes.
# They can be considered phases because the second matrix ("test")
# depends on the first ("dependencies").
# This workflow has multiple job matrixes.
# They can be considered phases because most of the matrices ("test",
# "coverage", "conan", ) depend on the first ("dependencies").
#
# The first phase has a job in the matrix for each combination of
# variables that affects dependency ABI:
@@ -30,13 +31,16 @@ concurrency:
# to hold the binaries if they are built locally.
# We must use the "{upload,download}-artifact" actions instead.
#
# The second phase has a job in the matrix for each test configuration.
# It installs dependency binaries from the cache, whichever was used,
# and builds and tests rippled.
# The remaining phases have a job in the matrix for each test
# configuration. They install dependency binaries from the cache,
# whichever was used, and build and test rippled.
#
# "instrumentation" is independent, but is included here because it also
# builds on linux in the same "on:" conditions.
jobs:
dependencies:
if: ${{ github.event_name == 'push' || github.event.pull_request.draft != true || contains(github.event.pull_request.labels.*.name, 'DraftRunCI') }}
strategy:
fail-fast: false
matrix:
@@ -60,15 +64,20 @@ jobs:
cc: /usr/bin/clang-14
cxx: /usr/bin/clang++-14
runs-on: [self-hosted, heavy]
container: rippleci/rippled-build-ubuntu:aaf5e3e
container: ghcr.io/xrplf/rippled-build-ubuntu:aaf5e3e
env:
build_dir: .build
steps:
- name: upgrade conan
run: |
pip install --upgrade "conan<2"
- name: checkout
uses: actions/checkout@v4
- name: check environment
run: |
echo ${PATH} | tr ':' '\n'
lsb_release -a || true
${{ matrix.profile.cc }} --version
conan --version
cmake --version
env | sort
@@ -94,13 +103,12 @@ jobs:
with:
configuration: ${{ matrix.configuration }}
- name: upload archive
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: ${{ matrix.platform }}-${{ matrix.compiler }}-${{ matrix.configuration }}
path: conan.tar
if-no-files-found: error
test:
strategy:
fail-fast: false
@@ -118,12 +126,15 @@ jobs:
- "-Dunity=ON"
needs: dependencies
runs-on: [self-hosted, heavy]
container: rippleci/rippled-build-ubuntu:aaf5e3e
container: ghcr.io/xrplf/rippled-build-ubuntu:aaf5e3e
env:
build_dir: .build
steps:
- name: upgrade conan
run: |
pip install --upgrade "conan<2"
- name: download cache
uses: actions/download-artifact@v3
uses: actions/download-artifact@v4
with:
name: ${{ matrix.platform }}-${{ matrix.compiler }}-${{ matrix.configuration }}
- name: extract cache
@@ -149,11 +160,64 @@ jobs:
with:
generator: Ninja
configuration: ${{ matrix.configuration }}
cmake-args: ${{ matrix.cmake-args }}
cmake-args: "-Dassert=TRUE -Dwerr=TRUE ${{ matrix.cmake-args }}"
- name: test
run: |
${build_dir}/rippled --unittest --unittest-jobs $(nproc)
reference-fee-test:
strategy:
fail-fast: false
matrix:
platform:
- linux
compiler:
- gcc
configuration:
- Debug
cmake-args:
- "-DUNIT_TEST_REFERENCE_FEE=200"
- "-DUNIT_TEST_REFERENCE_FEE=1000"
needs: dependencies
runs-on: [self-hosted, heavy]
container: ghcr.io/xrplf/rippled-build-ubuntu:aaf5e3e
env:
build_dir: .build
steps:
- name: upgrade conan
run: |
pip install --upgrade "conan<2"
- name: download cache
uses: actions/download-artifact@v4
with:
name: ${{ matrix.platform }}-${{ matrix.compiler }}-${{ matrix.configuration }}
- name: extract cache
run: |
mkdir -p ~/.conan
tar -xzf conan.tar -C ~/.conan
- name: check environment
run: |
env | sort
echo ${PATH} | tr ':' '\n'
conan --version
cmake --version
- name: checkout
uses: actions/checkout@v4
- name: dependencies
uses: ./.github/actions/dependencies
env:
CONAN_URL: http://18.143.149.228:8081/artifactory/api/conan/conan-non-prod
with:
configuration: ${{ matrix.configuration }}
- name: build
uses: ./.github/actions/build
with:
generator: Ninja
configuration: ${{ matrix.configuration }}
cmake-args: "-Dassert=TRUE -Dwerr=TRUE ${{ matrix.cmake-args }}"
- name: test
run: |
${build_dir}/rippled --unittest --unittest-jobs $(nproc)
coverage:
strategy:
@@ -167,12 +231,15 @@ jobs:
- Debug
needs: dependencies
runs-on: [self-hosted, heavy]
container: rippleci/rippled-build-ubuntu:aaf5e3e
container: ghcr.io/xrplf/rippled-build-ubuntu:aaf5e3e
env:
build_dir: .build
steps:
- name: upgrade conan
run: |
pip install --upgrade "conan<2"
- name: download cache
uses: actions/download-artifact@v3
uses: actions/download-artifact@v4
with:
name: ${{ matrix.platform }}-${{ matrix.compiler }}-${{ matrix.configuration }}
- name: extract cache
@@ -180,7 +247,7 @@ jobs:
mkdir -p ~/.conan
tar -xzf conan.tar -C ~/.conan
- name: install gcovr
run: pip install "gcovr>=7,<8"
run: pip install "gcovr>=7,<9"
- name: check environment
run: |
echo ${PATH} | tr ':' '\n'
@@ -203,6 +270,8 @@ jobs:
generator: Ninja
configuration: ${{ matrix.configuration }}
cmake-args: >-
-Dassert=TRUE
-Dwerr=TRUE
-Dcoverage=ON
-Dcoverage_format=xml
-DCODE_COVERAGE_VERBOSE=ON
@@ -214,7 +283,7 @@ jobs:
run: |
mv "${build_dir}/coverage.xml" ./
- name: archive coverage report
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: coverage.xml
path: coverage.xml
@@ -236,13 +305,16 @@ jobs:
conan:
needs: dependencies
runs-on: [self-hosted, heavy]
container: rippleci/rippled-build-ubuntu:aaf5e3e
container: ghcr.io/xrplf/rippled-build-ubuntu:aaf5e3e
env:
build_dir: .build
configuration: Release
steps:
- name: upgrade conan
run: |
pip install --upgrade "conan<2"
- name: download cache
uses: actions/download-artifact@v3
uses: actions/download-artifact@v4
with:
name: linux-gcc-${{ env.configuration }}
- name: extract cache
@@ -272,7 +344,7 @@ jobs:
echo "reference=${reference}" >> "${GITHUB_ENV}"
- name: build
run: |
cd examples/example
cd tests/conan
mkdir ${build_dir}
cd ${build_dir}
conan install .. --output-folder . \
@@ -282,3 +354,90 @@ jobs:
-DCMAKE_BUILD_TYPE=${configuration}
cmake --build .
./example | grep '^[[:digit:]]\+\.[[:digit:]]\+\.[[:digit:]]\+'
# NOTE we are not using the dependencies built above because they lag
# behind on compiler versions. Instrumentation requires clang version 16 or
# later
instrumentation-build:
if: ${{ github.event_name == 'push' || github.event.pull_request.draft != true || contains(github.event.pull_request.labels.*.name, 'DraftRunCI') }}
env:
CLANG_RELEASE: 16
strategy:
fail-fast: false
runs-on: [self-hosted, heavy]
container: debian:bookworm
steps:
- name: install prerequisites
env:
DEBIAN_FRONTEND: noninteractive
run: |
apt-get update
apt-get install --yes --no-install-recommends \
clang-${CLANG_RELEASE} clang++-${CLANG_RELEASE} \
python3-pip python-is-python3 make cmake git wget
apt-get clean
update-alternatives --install \
/usr/bin/clang clang /usr/bin/clang-${CLANG_RELEASE} 100 \
--slave /usr/bin/clang++ clang++ /usr/bin/clang++-${CLANG_RELEASE}
update-alternatives --auto clang
pip install --no-cache --break-system-packages "conan<2"
- name: checkout
uses: actions/checkout@v4
- name: prepare environment
run: |
mkdir ${GITHUB_WORKSPACE}/.build
echo "SOURCE_DIR=$GITHUB_WORKSPACE" >> $GITHUB_ENV
echo "BUILD_DIR=$GITHUB_WORKSPACE/.build" >> $GITHUB_ENV
echo "CC=/usr/bin/clang" >> $GITHUB_ENV
echo "CXX=/usr/bin/clang++" >> $GITHUB_ENV
- name: configure Conan
run: |
conan profile new --detect default
conan profile update settings.compiler=clang default
conan profile update settings.compiler.version=${CLANG_RELEASE} default
conan profile update settings.compiler.libcxx=libstdc++11 default
conan profile update settings.compiler.cppstd=20 default
conan profile update options.rocksdb=False default
conan profile update \
'conf.tools.build:compiler_executables={"c": "/usr/bin/clang", "cpp": "/usr/bin/clang++"}' default
conan profile update 'env.CXXFLAGS="-DBOOST_ASIO_DISABLE_CONCEPTS"' default
conan profile update 'conf.tools.build:cxxflags+=["-DBOOST_ASIO_DISABLE_CONCEPTS"]' default
conan export external/snappy snappy/1.1.10@
conan export external/soci soci/4.0.3@
- name: build dependencies
run: |
cd ${BUILD_DIR}
conan install ${SOURCE_DIR} \
--output-folder ${BUILD_DIR} \
--install-folder ${BUILD_DIR} \
--build missing \
--settings build_type=Debug
- name: build with instrumentation
run: |
cd ${BUILD_DIR}
cmake -S ${SOURCE_DIR} -B ${BUILD_DIR} \
-Dvoidstar=ON \
-Dtests=ON \
-Dxrpld=ON \
-DCMAKE_BUILD_TYPE=Debug \
-DSECP256K1_BUILD_BENCHMARK=OFF \
-DSECP256K1_BUILD_TESTS=OFF \
-DSECP256K1_BUILD_EXHAUSTIVE_TESTS=OFF \
-DCMAKE_TOOLCHAIN_FILE=${BUILD_DIR}/build/generators/conan_toolchain.cmake
cmake --build . --parallel $(nproc)
- name: verify instrumentation enabled
run: |
cd ${BUILD_DIR}
./rippled --version | grep libvoidstar
- name: run unit tests
run: |
cd ${BUILD_DIR}
./rippled -u --unittest-jobs $(( $(nproc)/4 ))

View File

@@ -2,6 +2,7 @@ name: windows
on:
pull_request:
types: [opened, reopened, synchronize, ready_for_review]
push:
# If the branches list is ever changed, be sure to change it on all
# build/test jobs (nix, macos, windows, instrumentation)
@@ -21,19 +22,22 @@ concurrency:
jobs:
test:
if: ${{ github.event_name == 'push' || github.event.pull_request.draft != true || contains(github.event.pull_request.labels.*.name, 'DraftRunCI') }}
strategy:
fail-fast: false
matrix:
generator:
- Visual Studio 16 2019
version:
- generator: Visual Studio 17 2022
runs-on: windows-2022
configuration:
- Release
# Github hosted runners tend to hang when running Debug unit tests.
# Instead of trying to work around it, disable the Debug job until
# something beefier (i.e. a heavy self-hosted runner) becomes
# available.
# - Debug
runs-on: windows-2019
- type: Release
tests: true
- type: Debug
# Skip running unit tests on debug builds, because they
# take an unreasonable amount of time
tests: false
runtime: d
runs-on: ${{ matrix.version.runs-on }}
env:
build_dir: .build
steps:
@@ -68,7 +72,9 @@ jobs:
run: |
conan profile new default --detect
conan profile update settings.compiler.cppstd=20 default
conan profile update settings.compiler.runtime=MT${{ matrix.configuration == 'Debug' && 'd' || '' }} default
conan profile update \
settings.compiler.runtime=MT${{ matrix.configuration.runtime }} \
default
- name: build dependencies
uses: ./.github/actions/dependencies
env:
@@ -76,16 +82,18 @@ jobs:
CONAN_LOGIN_USERNAME_RIPPLE: ${{ secrets.CONAN_USERNAME }}
CONAN_PASSWORD_RIPPLE: ${{ secrets.CONAN_TOKEN }}
with:
configuration: ${{ matrix.configuration }}
configuration: ${{ matrix.configuration.type }}
- name: build
uses: ./.github/actions/build
with:
generator: '${{ matrix.generator }}'
configuration: ${{ matrix.configuration }}
generator: '${{ matrix.version.generator }}'
configuration: ${{ matrix.configuration.type }}
# Hard code for now. Move to the matrix if varied options are needed
cmake-args: '-Dassert=ON -Dreporting=OFF -Dunity=ON'
cmake-args: '-Dassert=TRUE -Dwerr=TRUE -Dreporting=OFF -Dunity=ON'
cmake-target: install
- name: test
shell: bash
if: ${{ matrix.configuration.tests }}
run: |
${build_dir}/${{ matrix.configuration }}/rippled --unittest --unittest-jobs $(nproc)
${build_dir}/${{ matrix.configuration.type }}/rippled --unittest \
--unittest-jobs $(nproc)

View File

@@ -83,25 +83,49 @@ The [commandline](https://xrpl.org/docs/references/http-websocket-apis/api-conve
The `network_id` field was added in the `server_info` response in version 1.5.0 (2019), but it is not returned in [reporting mode](https://xrpl.org/rippled-server-modes.html#reporting-mode). However, use of reporting mode is now discouraged, in favor of using [Clio](https://github.com/XRPLF/clio) instead.
## XRP Ledger server version 2.5.0
As of 2025-04-04, version 2.5.0 is in development. You can use a pre-release version by building from source or [using the `nightly` package](https://xrpl.org/docs/infrastructure/installation/install-rippled-on-ubuntu).
### Additions and bugfixes in 2.5.0
- `channel_authorize`: If `signing_support` is not enabled in the config, the RPC is disabled.
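For reference, local signing is toggled by a stanza in the server config; a minimal sketch, assuming a packaged-install config path (adjust to your setup, and restart the server after editing):
```
# Path assumes a packaged install; adjust to your config location.
cat >> /etc/opt/ripple/rippled.cfg <<'EOF'
[signing_support]
true
EOF
```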
## XRP Ledger server version 2.4.0
### Addition in 2.4
[Version 2.4.0](https://github.com/XRPLF/rippled/releases/tag/2.4.0) was released on March 4, 2025.
### Additions and bugfixes in 2.4.0
- `ledger_entry`: `state` is added as an alias for `ripple_state`.
- `ledger_entry`: Enables case-insensitive filtering by canonical name in addition to case-sensitive filtering by RPC name.
- `validators`: Added new field `validator_list_threshold` in response.
- `simulate`: A new RPC that executes a [dry run of a transaction submission](https://github.com/XRPLF/XRPL-Standards/tree/master/XLS-0069d-simulate#2-rpc-simulate) (see the request sketch after this list)
- Signing methods autofill fees better and properly handle transactions that don't have a base fee, and will also autofill the `NetworkID` field.
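As an illustration of `simulate`, a minimal JSON-RPC sketch; the port and account addresses below are placeholders, and fields left unspecified (such as the fee and sequence) are autofilled for the dry run:
```
# Port and addresses are placeholders, not real values.
curl -s -X POST http://localhost:5005/ -d '{
  "method": "simulate",
  "params": [{
    "tx_json": {
      "TransactionType": "Payment",
      "Account": "rSenderPlaceholder",
      "Destination": "rDestinationPlaceholder",
      "Amount": "1000000"
    }
  }]
}'
```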
## XRP Ledger server version 2.3.0
### Breaking change in 2.3
[Version 2.3.0](https://github.com/XRPLF/rippled/releases/tag/2.3.0) was released on Nov 25, 2024.
### Breaking changes in 2.3.0
- `book_changes`: If the requested ledger version is not available on this node, a `ledgerNotFound` error is returned and the node does not attempt to acquire the ledger from the p2p network (as with other non-admin RPCs).
Admins can still attempt to retrieve old ledgers with the `ledger_request` RPC.
### Addition in 2.3
### Additions and bugfixes in 2.3.0
- `book_changes`: Returns a `validated` field in its response, which was missing in prior versions.
The following additions are non-breaking (because they are purely additive).
## XRP Ledger server version 2.2.0
[Version 2.2.0](https://github.com/XRPLF/rippled/releases/tag/2.2.0) was released on Jun 5, 2024. The following additions are non-breaking (because they are purely additive):
- The `feature` method now has a non-admin mode for users. (It was previously only available to admin connections.) The method returns an updated list of amendments, including their names and other information. ([#4781](https://github.com/XRPLF/rippled/pull/4781))
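For example, a non-admin client can now query amendment status directly; a sketch against a local server (JSON-RPC port assumed from the example config):
```
# Port assumed; no admin connection required as of 2.2.0.
curl -s -X POST http://localhost:5005/ -d '{"method": "feature", "params": [{}]}'
```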
## XRP Ledger server version 2.0.0
[Version 2.0.0](https://github.com/XRPLF/rippled/releases/tag/2.0.0) was released on Jan 9, 2024. The following additions are non-breaking (because they are purely additive):
- `server_definitions`: A new RPC that generates a `definitions.json`-like output that can be used in XRPL libraries.
- In `Payment` transactions, `DeliverMax` has been added. This is a replacement for the `Amount` field, which should not be used. Typically, the `delivered_amount` (in transaction metadata) should be used. To ease the transition, `DeliverMax` is present regardless of API version, since adding a field is non-breaking.
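To illustrate the `server_definitions` RPC above, a sketch of fetching the definitions from a local server (port assumed):
```
# Port assumed; returns a definitions.json-style document.
curl -s -X POST http://localhost:5005/ -d '{"method": "server_definitions", "params": [{}]}'
```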

View File

@@ -178,9 +178,9 @@ It does not override paths to dependencies when building with Visual Studio.
```
# Conan 1.x
conan export external/rocksdb rocksdb/6.29.5@
conan export external/rocksdb rocksdb/9.7.3@
# Conan 2.x
conan export --version 6.29.5 external/rocksdb
conan export --version 9.7.3 external/rocksdb
```
Export our [Conan recipe for SOCI](./external/soci).
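The Conan 1.x form below matches the project's workflow files; the 2.x form is the assumed equivalent, by analogy with the RocksDB commands above:
```
# Conan 1.x
conan export external/soci soci/4.0.3@
# Conan 2.x (assumed by analogy with the rocksdb export)
conan export --version 4.0.3 external/soci
```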
@@ -222,13 +222,15 @@ It fixes some source files to add missing `#include`s.
the `install-folder` or `-if` option to every `conan install` command
in the next step.
2. Generate CMake files for every configuration you want to build.
2. Use conan to generate CMake files for every configuration you want to build:
```
conan install .. --output-folder . --build missing --settings build_type=Release
conan install .. --output-folder . --build missing --settings build_type=Debug
```
To build Debug, in the next step, be sure to set `-DCMAKE_BUILD_TYPE=Debug`
For a single-configuration generator, e.g. `Unix Makefiles` or `Ninja`,
you only need to run this command once.
For a multi-configuration generator, e.g. `Visual Studio`, you may want to
@@ -258,13 +260,16 @@ It fixes some source files to add missing `#include`s.
Single-config generators:
Pass the CMake variable [`CMAKE_BUILD_TYPE`][build_type]
and make sure it matches the one of the `build_type` settings
you chose in the previous step.
For example, to build Debug, in the next command, replace "Release" with "Debug"
```
cmake -DCMAKE_TOOLCHAIN_FILE:FILEPATH=build/generators/conan_toolchain.cmake -DCMAKE_BUILD_TYPE=Release -Dxrpld=ON -Dtests=ON ..
```
Pass the CMake variable [`CMAKE_BUILD_TYPE`][build_type]
and make sure it matches the `build_type` setting you chose in the previous
step.
Multi-config generators:
@@ -274,7 +279,7 @@ It fixes some source files to add missing `#include`s.
**Note:** You can pass build options for `rippled` in this step.
4. Build `rippled`.
5. Build `rippled`.
For a single-configuration generator, it will build whatever configuration
you passed for `CMAKE_BUILD_TYPE`. For a multi-configuration generator,
@@ -283,7 +288,7 @@ It fixes some source files to add missing `#include`s.
Single-config generators:
```
cmake --build .
cmake --build . -j $(nproc)
```
Multi-config generators:
@@ -293,7 +298,7 @@ It fixes some source files to add missing `#include`s.
cmake --build . --config Debug
```
5. Test rippled.
6. Test rippled.
Single-config generators:
@@ -403,6 +408,23 @@ After any updates or changes to dependencies, you may need to do the following:
4. Re-run [conan install](#build-and-test).
### 'protobuf/port_def.inc' file not found
If `cmake --build .` results in an error due to a missing a protobuf file, then you might have generated CMake files for a different `build_type` than the `CMAKE_BUILD_TYPE` you passed to conan.
```
/rippled/.build/pb-xrpl.libpb/xrpl/proto/ripple.pb.h:10:10: fatal error: 'google/protobuf/port_def.inc' file not found
10 | #include <google/protobuf/port_def.inc>
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1 error generated.
```
For example, if you want to build Debug:
1. For conan install, pass `--settings build_type=Debug`
2. For cmake, pass `-DCMAKE_BUILD_TYPE=Debug`
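Putting the two together, a minimal sketch from the build directory, reusing the commands shown earlier:
```
# Both steps must agree on the build type (Debug here).
conan install .. --output-folder . --build missing --settings build_type=Debug
cmake -DCMAKE_TOOLCHAIN_FILE:FILEPATH=build/generators/conan_toolchain.cmake \
  -DCMAKE_BUILD_TYPE=Debug -Dxrpld=ON -Dtests=ON ..
cmake --build .
```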
### no std::result_of
If your compiler version is recent enough to have removed `std::result_of` as

View File

@@ -21,7 +21,7 @@ mkdir results
includes="$( pwd )/results/rawincludes.txt"
pushd ../..
echo Raw includes:
grep -r '#include.*/.*\.h' include src | \
grep -r '^[ ]*#include.*/.*\.h' include src | \
grep -v boost | tee ${includes}
popd
pushd results

View File

@@ -14,10 +14,10 @@ Loop: xrpld.app xrpld.net
xrpld.app > xrpld.net
Loop: xrpld.app xrpld.overlay
xrpld.overlay == xrpld.app
xrpld.overlay > xrpld.app
Loop: xrpld.app xrpld.peerfinder
xrpld.app > xrpld.peerfinder
xrpld.peerfinder ~= xrpld.app
Loop: xrpld.app xrpld.rpc
xrpld.rpc > xrpld.app

View File

@@ -6,6 +6,7 @@ libxrpl.protocol > xrpl.basics
libxrpl.protocol > xrpl.json
libxrpl.protocol > xrpl.protocol
libxrpl.resource > xrpl.basics
libxrpl.resource > xrpl.json
libxrpl.resource > xrpl.resource
libxrpl.server > xrpl.basics
libxrpl.server > xrpl.json
@@ -19,6 +20,7 @@ test.app > xrpl.basics
test.app > xrpld.app
test.app > xrpld.core
test.app > xrpld.ledger
test.app > xrpld.nodestore
test.app > xrpld.overlay
test.app > xrpld.rpc
test.app > xrpl.json
@@ -41,6 +43,7 @@ test.consensus > xrpl.basics
test.consensus > xrpld.app
test.consensus > xrpld.consensus
test.consensus > xrpld.ledger
test.consensus > xrpl.json
test.core > test.jtx
test.core > test.toplevel
test.core > test.unit_test
@@ -57,7 +60,6 @@ test.json > test.jtx
test.json > xrpl.json
test.jtx > xrpl.basics
test.jtx > xrpld.app
test.jtx > xrpld.consensus
test.jtx > xrpld.core
test.jtx > xrpld.ledger
test.jtx > xrpld.net
@@ -81,6 +83,7 @@ test.nodestore > xrpld.core
test.nodestore > xrpld.nodestore
test.nodestore > xrpld.unity
test.overlay > test.jtx
test.overlay > test.toplevel
test.overlay > test.unit_test
test.overlay > xrpl.basics
test.overlay > xrpld.app
@@ -156,7 +159,6 @@ xrpld.core > xrpl.basics
xrpld.core > xrpl.json
xrpld.core > xrpl.protocol
xrpld.ledger > xrpl.basics
xrpld.ledger > xrpld.core
xrpld.ledger > xrpl.json
xrpld.ledger > xrpl.protocol
xrpld.net > xrpl.basics
@@ -181,7 +183,6 @@ xrpld.peerfinder > xrpld.core
xrpld.peerfinder > xrpl.protocol
xrpld.perflog > xrpl.basics
xrpld.perflog > xrpl.json
xrpld.perflog > xrpl.protocol
xrpld.rpc > xrpl.basics
xrpld.rpc > xrpld.core
xrpld.rpc > xrpld.ledger

View File

@@ -16,16 +16,36 @@ set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
if(CMAKE_CXX_COMPILER_ID MATCHES "GNU")
# GCC-specific fixes
add_compile_options(-Wno-unknown-pragmas -Wno-subobject-linkage)
# -Wno-subobject-linkage can be removed when we upgrade GCC version to at least 13.3
elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
# Clang-specific fixes
add_compile_options(-Wno-unknown-warning-option) # Ignore unknown warning options
elseif(MSVC)
# MSVC-specific fixes
add_compile_options(/wd4068) # Ignore unknown pragmas
endif()
# make GIT_COMMIT_HASH define available to all sources
find_package(Git)
if(Git_FOUND)
execute_process(COMMAND ${GIT_EXECUTABLE} --git-dir=${CMAKE_CURRENT_SOURCE_DIR}/.git describe --always --abbrev=40
execute_process(COMMAND ${GIT_EXECUTABLE} --git-dir=${CMAKE_CURRENT_SOURCE_DIR}/.git rev-parse HEAD
OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE gch)
if(gch)
set(GIT_COMMIT_HASH "${gch}")
message(STATUS gch: ${GIT_COMMIT_HASH})
add_definitions(-DGIT_COMMIT_HASH="${GIT_COMMIT_HASH}")
endif()
execute_process(COMMAND ${GIT_EXECUTABLE} --git-dir=${CMAKE_CURRENT_SOURCE_DIR}/.git rev-parse --abbrev-ref HEAD
OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE gb)
if(gb)
set(GIT_BRANCH "${gb}")
message(STATUS gb: ${GIT_BRANCH})
add_definitions(-DGIT_BRANCH="${GIT_BRANCH}")
endif()
endif() #git
if(thread_safety_analysis)
@@ -74,6 +94,7 @@ add_subdirectory(external/secp256k1)
add_library(secp256k1::secp256k1 ALIAS secp256k1)
add_subdirectory(external/ed25519-donna)
add_subdirectory(external/antithesis-sdk)
add_subdirectory(external/blake3)
find_package(gRPC REQUIRED)
find_package(lz4 REQUIRED)
# Target names with :: are not allowed in a generator expression.
@@ -104,6 +125,7 @@ target_link_libraries(ripple_libs INTERFACE
secp256k1::secp256k1
soci::soci
SQLite::SQLite3
blake3
)
# Work around changes to Conan recipe for now.

View File

@@ -5,15 +5,12 @@ XRPL.
# Contributing
We assume you are familiar with the general practice of [making
contributions on GitHub][1]. This file includes only special
contributions on GitHub][contrib]. This file includes only special
instructions specific to this project.
## Before you start
In general, contributions should be developed in your personal
[fork](https://github.com/XRPLF/rippled/fork).
The following branches exist in the main project repository:
- `develop`: The latest set of unreleased features, and the most common
@@ -26,9 +23,20 @@ The tip of each branch must be signed. In order for GitHub to sign a
squashed commit that it builds from your pull request, GitHub must know
your verifying key. Please set up [signature verification][signing].
[rippled]: https://github.com/XRPLF/rippled
[signing]:
https://docs.github.com/en/authentication/managing-commit-signature-verification/about-commit-signature-verification
In general, external contributions should be developed in your personal
[fork][forking]. Contributions from developers with write permissions
should be done in [the main repository][rippled] in a branch with
a permitted prefix. Permitted prefixes are:
* XLS-[a-zA-Z0-9]+/.+
* e.g. XLS-0033d/mpt-clarify-STEitherAmount
* [GitHub username]/.+
* e.g. JoelKatz/fix-rpc-webhook-queue
* [Organization name]/.+
* e.g. ripple/antithesis
Regardless of where the branch is created, please open a *draft* pull
request as soon as possible after pushing the branch to Github, to
increase visibility, and ease feedback during the development process.
## Major contributions
@@ -49,6 +57,7 @@ author delegates that responsibility to others.
## Before making a pull request
(Or marking a draft pull request as ready.)
Changes that alter transaction processing must be guarded by an
[Amendment](https://xrpl.org/amendments.html).
@@ -57,18 +66,19 @@ Amendment.
Ensure that your code compiles according to the build instructions in
[`BUILD.md`](./BUILD.md).
If you create new source files, they must go under `src/ripple`.
You will need to add them to one of the
[source lists](./Builds/CMake/RippledCore.cmake) in CMake.
Please write tests for your code.
If you create new test source files, they must go under `src/test`.
You will need to add them to one of the
[source lists](./Builds/CMake/RippledCore.cmake) in CMake.
If your test can be run offline, in under 60 seconds, then it can be an
automatic test run by `rippled --unittest`.
Otherwise, it must be a manual test.
If you create new source files, they must be organized as follows:
* If the files are in any of the `libxrpl` modules, the headers (`.h`) must go
under `include/xrpl`, and source (`.cpp`) files must go under
`src/libxrpl`.
* All other non-test files must go under `src/xrpld`.
* All test source files must go under `src/test`.
The source must be formatted according to the style guide below.
Header includes must be [levelized](./Builds/levelization).
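To check levelization locally before opening a pull request, a sketch mirroring the CI job:
```
# Regenerate the levelization reports from the repo root.
Builds/levelization/levelization.sh
# A non-empty diff means the module dependency relationships changed.
git diff --exit-code
```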
@@ -97,6 +107,19 @@ Refer to
["How to Write a Git Commit Message"](https://cbea.ms/git-commit/)
for general rules on writing a good commit message.
tl;dr
> 1. Separate subject from body with a blank line.
> 2. Limit the subject line to 50 characters.
> * [...]shoot for 50 characters, but consider 72 the hard limit.
> 3. Capitalize the subject line.
> 4. Do not end the subject line with a period.
> 5. Use the imperative mood in the subject line.
> * A properly formed Git commit subject line should always be able
> to complete the following sentence: "If applied, this commit will
> _your subject line here_".
> 6. Wrap the body at 72 characters.
> 7. Use the body to explain what and why vs. how.
In addition to those guidelines, please add one of the following
prefixes to the subject line if appropriate.
* `fix:` - The primary purpose is to fix an existing bug.
@@ -119,7 +142,10 @@ unit tests for Feature X (#1234)`.
## Pull requests
In general, pull requests use `develop` as the base branch.
(Hotfixes are an exception.)
The exceptions are
* Fixes and improvements to a release candidate use `release` as the
base.
* Hotfixes use `master` as the base.
If your changes are not quite ready, but you want to make it easily available
for preliminary examination or review, you can create a "Draft" pull request.
@@ -142,14 +168,12 @@ before it can be considered for merge by a Maintainer.
Maintainers retain discretion to require more approvals if they feel the
credibility of the existing approvals is insufficient.
Pull requests must be merged by [squash-and-merge][2]
Pull requests must be merged by [squash-and-merge][squash]
to preserve a linear history for the `develop` branch.
### When and how to merge pull requests
### "Ready to merge"
#### "Passed"
A pull request should only have the "Passed" label added when it
A pull request should only have the "Ready to merge" label added when it
meets a few criteria:
1. It must have two approving reviews [as described
@@ -166,142 +190,17 @@ meets a few criteria:
merge, they should also ensure the commit message(s) are updated
as well.
4. The PR branch must be up to date with the base branch (usually
`develop`). This is usually accomplised by merging the base branch
`develop`). This is usually accomplished by merging the base branch
into the feature branch, but if the other criteria are met, the
changes can be squashed and rebased on top of the base branch.
5. Finally, and most importantly, the author of the PR must
positively indicate that the PR is ready to merge. That can be
accomplished by adding the "Passed" label if their role allows,
or by leaving a comment to the effect that the PR is ready to
accomplished by adding the "Ready to merge" label if their role
allows, or by leaving a comment to the effect that the PR is ready to
merge.
Once the "Passed" label is added, a maintainer may merge the PR at
any time, so don't use it lightly.
#### Instructions for maintainers
The maintainer should double-check that the PR has met all the
necessary criteria, and can request additional information from the
owner, or additional reviews, and can always feel free to remove the
"Passed" label if appropriate. The maintainer has final say on
whether a PR gets merged, and is encouraged to communicate any
issues or concerns to other maintainers.
##### Most pull requests: "Squash and merge"
Most pull requests don't need special handling, and can simply be
merged using the "Squash and merge" button on the Github UI. Update
the suggested commit message if necessary.
##### Slightly more complicated pull requests
Some pull requests need to be pushed to `develop` as more than one
commit. There are multiple ways to accomplish this. If the author
describes a process, and it is reasonable, follow it. Otherwise, do
a fast forward only merge (`--ff-only`) on the command line and push.
Either way, check that:
* The commits are based on the current tip of `develop`.
* The commits are clean: No merge commits (except when reverse
merging), no "[FOLD]" or "fixup!" messages.
* All commits are signed. If the commits are not signed by the author, use
`git commit --amend -S` to sign them yourself.
* At least one (but preferably all) of the commits has the PR number
in the commit message.
**Never use the "Create a merge commit" or "Rebase and merge"
functions!**
##### Releases, release candidates, and betas
All releases, including release candidates and betas, are handled
differently from typical PRs. Most importantly, never use
the Github UI to merge a release.
1. There are two possible conditions that the `develop` branch will
be in when preparing a release.
1. Ready or almost ready to go: There may be one or two PRs that
need to be merged, but otherwise, the only change needed is to
update the version number in `BuildInfo.cpp`. In this case,
merge those PRs as appropriate, updating the second one, and
waiting for CI to finish in between. Then update
`BuildInfo.cpp`.
2. Several pending PRs: In this case, do not use the Github UI,
because the delays waiting for CI in between each merge will be
unnecessarily onerous. Instead, create a working branch (e.g.
`develop-next`) based off of `develop`. Squash the changes
from each PR onto the branch, one commit each (unless
more are needed), being sure to sign each commit and update
the commit message to include the PR number. You may be able
to use a fast-forward merge for the first PR. The workflow may
look something like:
```
git fetch upstream
git checkout upstream/develop
git checkout -b develop-next
# Use -S on the ff-only merge if prbranch1 isn't signed.
# Or do another branch first.
git merge --ff-only user1/prbranch1
git merge --squash user2/prbranch2
git commit -S
git merge --squash user3/prbranch3
git commit -S
[...]
git push --set-upstream origin develop-next
```
2. Create the Pull Request with `release` as the base branch. If any
of the included PRs are still open,
[use closing keywords](https://docs.github.com/articles/closing-issues-using-keywords)
in the description to ensure they are closed when the code is
released. e.g. "Closes #1234"
3. Instead of the default template, reuse and update the message from
the previous release. Include the following verbiage somewhere in
the description:
```
The base branch is release. All releases (including betas) go in
release. This PR will be merged with --ff-only (not squashed or
rebased, and not using the GitHub UI) to both release and develop.
```
4. Sign-offs for the three platforms usually occur offline, but at
least one approval will be needed on the PR.
5. Once everything is ready to go, open a terminal, and do the
fast-forward merges manually. Do not push any branches until you
verify that all of them update correctly.
```
git fetch upstream
git checkout -b upstream--develop -t upstream/develop || git checkout upstream--develop
git reset --hard upstream/develop
# develop-next must be signed already!
git merge --ff-only origin/develop-next
git checkout -b upstream--release -t upstream/release || git checkout upstream--release
git reset --hard upstream/release
git merge --ff-only origin/develop-next
# Only do these 3 steps if pushing a release. No betas or RCs
git checkout -b upstream--master -t upstream/master || git checkout upstream--master
git reset --hard upstream/master
git merge --ff-only origin/develop-next
# Check that all of the branches are updated
git log -1 --oneline
# The output should look like:
# 02ec8b7962 (HEAD -> upstream--master, origin/develop-next, upstream--release, upstream--develop, develop-next) Set version to 2.2.0-rc1
# Note that all of the upstream--develop/release/master are on this commit.
# (Master will be missing for betas, etc.)
# Just to be safe, do a dry run first:
git push --dry-run upstream-push HEAD:develop
git push --dry-run upstream-push HEAD:release
# git push --dry-run upstream-push HEAD:master
# Now push
git push upstream-push HEAD:develop
git push upstream-push HEAD:release
# git push upstream-push HEAD:master
# Don't forget to tag the release, too.
git tag <version number>
git push upstream-push <version number>
```
6. Finally
[create a new release on Github](https://github.com/XRPLF/rippled/releases).
Once the "Ready to merge" label is added, a maintainer may merge the PR
at any time, so don't use it lightly.
# Style guide
@@ -312,7 +211,7 @@ coherent rather than a set of _thou shalt not_ commandments.
## Formatting
All code must conform to `clang-format` version 10,
All code must conform to `clang-format` version 18,
according to the settings in [`.clang-format`](./.clang-format),
unless the result would be unreasonably difficult to read or maintain.
To demarcate lines that should be left as-is, surround them with comments like
@@ -477,17 +376,22 @@ existing maintainer without a vote.
## Current Maintainers
Maintainers are users with admin access to the repo. Maintainers do not typically approve or deny pull requests.
Maintainers are users with maintain or admin access to the repo.
* [bthomee](https://github.com/bthomee) (Ripple)
* [intelliot](https://github.com/intelliot) (Ripple)
* [JoelKatz](https://github.com/JoelKatz) (Ripple)
* [nixer89](https://github.com/nixer89) (XRP Ledger Foundation)
* [RichardAH](https://github.com/RichardAH) (XRP Ledger Foundation)
* [Silkjaer](https://github.com/Silkjaer) (XRP Ledger Foundation)
* [WietseWind](https://github.com/WietseWind) (XRPL Labs + XRP Ledger Foundation)
* [ximinez](https://github.com/ximinez) (Ripple)
## Current Code Reviewers
Code Reviewers are developers who have the ability to review and approve source code changes.
Code Reviewers are developers who have the ability to review, approve, and
in some cases merge source code changes.
* [HowardHinnant](https://github.com/HowardHinnant) (Ripple)
* [scottschurr](https://github.com/scottschurr) (Ripple)
@@ -511,6 +415,607 @@ Code Reviewers are developers who have the ability to review and approve source
* [RichardAH](https://github.com/RichardAH) (XRPL Labs + XRP Ledger Foundation)
* [dangell7](https://github.com/dangell7) (XRPL Labs)
Developers not on this list are able and encouraged to submit feedback
on pending code changes (open pull requests).
[1]: https://docs.github.com/en/get-started/quickstart/contributing-to-projects
[2]: https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/incorporating-changes-from-a-pull-request/about-pull-request-merges#squash-and-merge-your-commits
## Instructions for maintainers
These instructions assume you have your git upstream remotes configured
to avoid accidental pushes to the main repo, and a remote group
specifying both of them. e.g.
```
$ git remote -v | grep upstream
upstream https://github.com/XRPLF/rippled.git (fetch)
upstream https://github.com/XRPLF/rippled.git (push)
upstream-push git@github.com:XRPLF/rippled.git (fetch)
upstream-push git@github.com:XRPLF/rippled.git (push)
$ git config remotes.upstreams
upstream upstream-push
```
You can use the [setup-upstreams] script to set this up.
It also assumes you have a default gpg signing key set up in git. e.g.
```
$ git config user.signingkey
968479A1AFF927E37D1A566BB5690EEEBB952194
# (This is github's key. Use your own.)
```
### When and how to merge pull requests
The maintainer should double-check that the PR has met all the
necessary criteria, and can request additional information from the
owner, or additional reviews, and can always feel free to remove the
"Ready to merge" label if appropriate. The maintainer has final say on
whether a PR gets merged, and is encouraged to communicate any issues
or concerns to other maintainers.
#### Most pull requests: "Squash and merge"
Most pull requests don't need special handling, and can simply be
merged using the "Squash and merge" button on the Github UI. Update
the suggested commit message, or modify it as needed.
#### Slightly more complicated pull requests
Some pull requests need to be pushed to `develop` as more than one
commit. A PR author may *request* to merge as separate commits. They
must *justify* why separate commits are needed, and *specify* how they
would like the commits to be merged. If you disagree with the author,
discuss it with them directly.
If the process is reasonable, follow it. The simplest option is to do a
fast forward only merge (`--ff-only`) on the command line and push to
`develop`.
Some examples of when separate commits are worthwhile are:
1. PRs where source files are reorganized in multiple steps.
2. PRs where the commits are mostly independent and *could* be separate
PRs, but are pulled together into one PR under a commit theme or
issue.
3. PRs that are complicated enough that `git bisect` would not be much
help if it determined this PR introduced a problem.
Either way, check that:
* The commits are based on the current tip of `develop`.
* The commits are clean: No merge commits (except when reverse
merging), no "[FOLD]" or "fixup!" messages.
* All commits are signed. If the commits are not signed by the author, use
`git commit --amend -S` to sign them yourself.
* At least one (but preferably all) of the commits has the PR number
in the commit message.
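A quick local check before pushing, assuming the `upstreams` remote group configured above:
```
git fetch upstreams
# The branch should contain only clean, signed commits on top of develop.
git log --show-signature "upstream/develop..HEAD"
```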
The "Create a merge commit" and "Rebase and merge" options should be
disabled in the Github UI, but if you ever find them available **Do not
use them!**
### Releases
All releases, including release candidates and betas, are handled
differently from typical PRs. Most importantly, never use
the Github UI to merge a release.
Rippled uses a linear workflow model that can be summarized as:
1. In between releases, developers work against the `develop` branch.
2. Periodically, a maintainer will build and tag a beta version from
`develop`, which is pushed to `release`.
* Betas are usually released every two to three weeks, though that
schedule can vary depending on progress, availability, and other
factors.
3. When the changes in `develop` are considered stable and mature enough
to be ready to release, a release candidate (RC) is built and tagged
from `develop`, and merged to `release`.
* Further development for that release (primarily fixes) then
continues against `release`, while other development continues on
`develop`. Effectively, `release` is forked from `develop`. Changes
to `release` must be reverse merged to `develop`.
4. When the candidate has passed testing and is ready for release, the
final release is merged to `master`.
5. If any issues are found post-release, a hotfix / point release may be
created, which is merged to `master`, and then reverse merged to
`develop`.
#### Betas, and the first release candidate
##### Preparing the `develop` branch
1. Optimally, the `develop` branch will be ready to go, with all
relevant PRs already merged.
2. If there are any PRs pending, merge them **BEFORE** preparing the beta.
1. If only one or two PRs need to be merged, merge those PRs [as
normal](#when-and-how-to-merge-pull-requests), updating the second
one, and waiting for CI to finish in between.
2. If there are several pending PRs, do not use the Github UI,
because the delays waiting for CI in between each merge will be
unnecessarily onerous. (Incidentally, this process can also be
used to merge if the Github UI has issues.) Merge each PR branch
directly to a `release-next` branch on your local machine and create a single
PR, then push your branch to `develop`.
1. Squash the changes from each PR, one commit each (unless more
are needed), being sure to sign each commit and update the
commit message to include the PR number. You may be able to use
a fast-forward merge for the first PR.
2. Push your branch.
3. Continue to [Making the release](#making-the-release) to update
the version number, etc.
The workflow may look something like:
```
git fetch --multiple upstreams user1 user2 user3 [...]
git checkout -B release-next --no-track upstream/develop
# Only do an ff-only merge if prbranch1 is either already
# squashed, or needs to be merged with separate commits,
# and has no merge commits.
# Use -S on the ff-only merge if prbranch1 isn't signed.
git merge [-S] --ff-only user1/prbranch1
git merge --squash user2/prbranch2
git commit -S # Use the commit message provided on the PR
git merge --squash user3/prbranch3
git commit -S # Use the commit message provided on the PR
[...]
# Make sure the commits look right
git log --show-signature "upstream/develop..HEAD"
git push --set-upstream origin
# Continue to "Making the release" to update the version number, so
# everything can be done in one PR.
```
You can also use the [squash-branches] script.
You may also need to manually close the open PRs after the changes are
merged to `develop`. Be sure to include the commit ID.
##### Making the release
This includes betas and the first release candidate (RC).
1. If you didn't create one while [preparing the `develop`
branch](#preparing-the-develop-branch), ensure there is no old
`release-next` branch hanging around. Then make a `release-next`
branch that only changes the version number. e.g.
```
git fetch upstreams
git checkout --no-track -B release-next upstream/develop
v="A.B.C-bD"
build=$( find -name BuildInfo.cpp )
sed 's/\(^.*versionString =\).*$/\1 "'${v}'"/' ${build} > version.cpp && mv -vi version.cpp ${build}
git diff
git add ${build}
git commit -S -m "Set version to ${v}"
# You could use your "origin" repo, but some CI tests work better on upstream.
git push upstream-push
git fetch upstreams
git branch --set-upstream-to=upstream/release-next
```
You can also use the [update-version] script.
2. Create a Pull Request for `release-next` with **`develop`** as
the base branch.
1. Use the title "[TRIVIAL] Set version to X.X.X-bX".
2. Instead of the default description template, use the following:
```
## High Level Overview of Change
This PR only changes the version number. It will be merged as
soon as Github CI actions successfully complete.
```
3. Wait for CI to successfully complete, and get someone to approve
the PR. (It is safe to ignore known CI issues.)
4. Push the updated `develop` branch using your `release-next`
branch. **Do not use the Github UI. It's important to preserve
commit IDs.**
```
git push upstream-push release-next:develop
```
5. In the unlikely event that the push fails because someone has merged
something else in the meantime, rebase your branch onto the updated
`develop` branch, push again, and go back to step 3.
6. Ensure that your PR against `develop` is closed. Github should do it
automatically.
7. Once this is done, forward progress on `develop` can continue
(other PRs may be merged).
8. Now create a Pull Request for `release-next` with **`release`** as
the base branch. Instead of the default template, reuse and update
the message from the previous release. Include the following verbiage
somewhere in the description:
```
The base branch is `release`. [All releases (including
betas)](https://github.com/XRPLF/rippled/blob/develop/CONTRIBUTING.md#before-you-start)
go in `release`. This PR branch will be pushed directly to `release` (not
squashed or rebased, and not using the GitHub UI).
```
9. Sign-offs for the three platforms (Linux, Mac, Windows) usually occur
offline, but at least one approval will be needed on the PR.
* If issues are discovered during testing, simply abandon the
release. It's easy to start a new release, it should be easy to
abandon one. **DO NOT REUSE THE VERSION NUMBER.** e.g. If you
abandon 2.4.0-b1, the next attempt will be 2.4.0-b2.
10. Once everything is ready to go, push to `release`.
```
git fetch upstreams
# Just to be safe, do a dry run first:
git push --dry-run upstream-push release-next:release
# If everything looks right, push the branch
git push upstream-push release-next:release
# Check that all of the branches are updated
git fetch upstreams
git log -1 --oneline
# The output should look like:
# 0123456789 (HEAD -> upstream/release-next, upstream/release,
# upstream/develop) Set version to 2.4.0-b1
# Note that upstream/develop may not be on this commit, but
# upstream/release must be.
# Other branches, including some from upstream-push, may also be
# present.
```
11. Tag the release, too.
```
git tag <version number>
git push upstream-push <version number>
```
12. Delete the `release-next` branch on the repo. Use the Github UI or:
```
git push --delete upstream-push release-next
```
13. Finally [create a new release on
Github](https://github.com/XRPLF/rippled/releases).
#### Release candidates after the first
Once the first release candidate is [merged into
release](#making-the-release), then `release` and `develop` *are allowed
to diverge*.
If a bug or issue is discovered in a version that has a release
candidate being tested, any fix and new version will need to be applied
against `release`, then reverse-merged to `develop`. This helps keep git
history as linear as possible.
A `release-next` branch will be created from `release`, and any further
work for that release must be based on `release-next`. Specifically,
PRs must use `release-next` as the base, and those PRs will be merged
directly to `release-next` when approved. Changes should be restricted
to bug fixes, but other changes may be necessary from time to time.
1. Open any PRs for the pending release using `release-next` as the base,
so they can be merged directly into it. Unlike `develop`, though,
`release-next` can be thrown away and recreated if necessary.
2. Once a new release candidate is ready, create a version commit as in
step 1 [above](#making-the-release) on `release-next`. You can use
the [update-version] script for this, too.
3. Jump to step 8 ("Now create a Pull Request for `release-next` with
**`release`** as the base") from the process
[above](#making-the-release) to merge `release-next` into `release`.
##### Follow up: reverse merge
Once the RC is merged and tagged, it needs to be reverse merged into
`develop` as soon as possible.
1. Create a branch, based on `upstream/develop`.
The branch name is not important, but could include "mergeNNNrcN".
E.g., for release A.B.C-rcD, use `mergeABCrcD`.
```
git fetch upstreams
git checkout --no-track -b mergeABCrcD upstream/develop
```
2. Merge `release` into your branch.
```
# I like the "--edit --log --verbose" parameters, but they are
# not required.
git merge upstream/release
```
3. `BuildInfo.cpp` will have a conflict with the version number.
Resolve it with the version from `develop` - the higher version.
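One way to resolve it, as a sketch; this assumes the merge is done on a branch based on `develop`, so `--ours` is the `develop` side:
```
# --ours = the develop side of this merge (branch is based on develop).
git checkout --ours src/libxrpl/protocol/BuildInfo.cpp
git add src/libxrpl/protocol/BuildInfo.cpp
# Conclude the merge.
git commit
```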
4. Push your branch to your repo (or `upstream` if you have permission),
and open a normal PR against `develop`. The "High level overview" can
simply indicate that this is a merge of the RC. The "Context" should
summarize the changes from the RC. Include the following text
prominently:
```
This PR must be merged manually using a push. Do not use the Github UI.
```
5. Depending on the complexity of the changes, and/or merge conflicts,
the PR may need a thorough review, or just a sign-off that the
merge was done correctly.
6. If `develop` is updated before this PR is merged, do not merge
`develop` back into your branch. Instead rebase preserving merges,
or do the merge again. (See also the `rerere` git config setting.)
```
git rebase --rebase-merges upstream/develop
# OR
git reset --hard upstream/develop
git merge upstream/release
```
7. When the PR is ready, push it to `develop`.
```
git fetch upstreams
# Make sure the commits look right
git log --show-signature "upstream/develop^..HEAD"
git push upstream-push mergeABCrcD:develop
git fetch upstreams
```
Development on `develop` can proceed as normal.
#### Final releases
A final release is any release that is not a beta or RC, such as 2.2.0.
Only code that has already been tested and vetted across all three
platforms should be included in a final release. Most of the time, that
means that the commit immediately preceding the commit setting the
version number will be an RC. Occasionally, there may be last-minute bug
fixes included as well. If so, those bug fixes must have been tested
internally as if they were RCs (at minimum, ensuring unit tests pass,
and the app starts, syncs, and stops cleanly across all three
platforms.)
*If in doubt, make an RC first.*
The process for building a final release is very similar to [the process
for building a beta](#making-the-release), except the code will be
moving from `release` to `master` instead of from `develop` to
`release`, and both branches will be pushed at the same time.
1. Ensure there is no old `master-next` branch hanging around.
Then make a `master-next` branch that only changes the version
number. As above, or using the
[update-version] script.
2. Create a Pull Request for `master-next` with **`master`** as
the base branch. Instead of the default template, reuse and update
the message from the previous final release. Include the following verbiage
somewhere in the description:
```
The base branch is `master`. This PR branch will be pushed directly to
`release` and `master` (not squashed or rebased, and not using the
GitHub UI).
```
3. Sign-offs for the three platforms (Linux, Mac, Windows) usually occur
offline, but at least one approval will be needed on the PR.
* If issues are discovered during testing, close the PR, delete
`master-next`, and move development back to `release`, [issuing
more RCs as necessary](#release-candidates-after-the-first)
4. Once everything is ready to go, push to `release` and `master`.
```
git fetch upstreams
# Just to be safe, do dry runs first:
git push --dry-run upstream-push master-next:release
git push --dry-run upstream-push master-next:master
# If everything looks right, push the branch
git push upstream-push master-next:release
git push upstream-push master-next:master
# Check that all of the branches are updated
git fetch upstreams
git log -1 --oneline
# The output should look like:
# 0123456789 (HEAD -> upstream/master-next, upstream/master,
# upstream/release) Set version to A.B.0
# Note that both upstream/release and upstream/master must be on this
# commit.
# Other branches, including some from upstream-push, may also be
# present.
```
5. Tag the release, too.
```
git tag <version number>
git push upstream-push <version number>
```
6. Delete the `master-next` branch on the repo. Use the GitHub UI or:
```
git push --delete upstream-push master-next
```
7. [Create a new release on
Github](https://github.com/XRPLF/rippled/releases). Be sure that
"Set as the latest release" is checked.
8. Finally, [reverse merge the release into `develop`](#follow-up-reverse-merge).
#### Special cases: point releases, hotfixes, etc.
On occasion, a bug or issue is discovered in a version that already
had a final release. Most of the time, development will have started
on the next version, and there will usually be changes in `develop`
and often in `release`.
Because git history is kept as linear as possible, any fix and new
version will need to be applied against `master`.
The process for building a hotfix release is very similar to [the
process for building release candidates after the
first](#release-candidates-after-the-first) and [for building a final
release](#final-releases), except the changes will be done against
`master` instead of `release`.
If there is only a single issue for the hotfix, the work can be done in
any branch. When it's ready to merge, jump to step 3 using your branch
instead of `master-next`.
1. Create a `master-next` branch from `master`.
```
git checkout --no-track -b master-next upstream/master
git push upstream-push
git fetch upstreams
```
2. Open any PRs for the pending hotfix using `master-next` as the base,
so they can be merged directly into it. Unlike `develop`, though,
`master-next` can be thrown away and recreated if necessary.
3. Once the hotfix is ready, create a version commit using the same
steps as above, or use the
[update-version] script.
4. Create a Pull Request for `master-next` with **`master`** as
the base branch. Instead of the default template, reuse and update
the message from the previous final release. Include the following verbiage
somewhere in the description:
```
The base branch is `master`. This PR branch will be pushed directly to
`master` (not squashed or rebased, and not using the GitHub UI).
```
5. Sign-offs for the three platforms (Linux, Mac, Windows) usually occur
offline, but at least one approval will be needed on the PR.
* If issues are discovered during testing, update `master-next` as
needed, but ensure that the changes are properly squashed, and the
version-setting commit remains last (see the sketch below).
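One way to do that cleanup is an interactive rebase (a sketch;
review the todo list carefully before force-pushing):
```
git fetch upstreams
# Squash fixups and keep the "Set version" commit last
git rebase --interactive upstream/master master-next
git push --force-with-lease upstream-push master-next
```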
6. Once everything is ready to go, push to `master` **only**.
```
git fetch upstreams
# Just to be safe, do a dry run first:
git push --dry-run upstream-push master-next:master
# If everything looks right, push the branch
git push upstream-push master-next:master
# Check that all of the branches are updated
git fetch upstreams
git log -1 --oneline
# The output should look like:
# 0123456789 (HEAD -> upstream/master-next, upstream/master) Set version
# to 2.4.1
# Note that upstream/master must be on this commit. upstream/release and
# upstream/develop should not.
# Other branches, including some from upstream-push, may also be
# present.
```
7. Tag the release, too.
```
git tag <version number>
git push upstream-push <version number>
```
8. Delete the `master-next` branch on the repo.
```
git push --delete upstream-push master-next
```
9. [Create a new release on
Github](https://github.com/XRPLF/rippled/releases). Be sure that
"Set as the latest release" is checked.
Once the hotfix is released, it needs to be reverse merged into
`develop` as soon as possible. It may also need to be merged into
`release` if a release candidate is under development.
1. Create a branch in your own repo, based on `upstream/develop`.
The branch name is not important, but could include "mergeNNN".
e.g., for release 2.2.3, use `merge223`.
```
git fetch upstreams
git checkout --no-track -b merge223 upstream/develop
```
2. Merge `master` into your branch.
```
# I like the "--edit --log --verbose" parameters, but they are
# not required.
git merge upstream/master
```
3. `BuildInfo.cpp` will have a conflict with the version number.
Resolve it in favor of the version from `develop` (the higher version).
4. Push your branch to your repo, and open a normal PR against
`develop`. The "High level overview" can simply indicate that this
is a merge of the hotfix version. The "Context" should summarize
the changes from the hotfix. Include the following text
prominently:
```
This PR must be merged manually using a --ff-only merge. Do not use the GitHub UI.
```
5. Depending on the complexity of the hotfix, and/or merge conflicts,
the PR may need a thorough review, or just a sign-off that the
merge was done correctly.
6. If `develop` is updated before this PR is merged, do not merge
`develop` back into your branch. Instead rebase preserving merges,
or do the merge again. (See also the `rerere` git config setting.)
```
git rebase --rebase-merges upstream/develop
# OR
git reset --hard upstream/develop
git merge upstream/master
```
7. When the PR is ready, push it to `develop`.
```
git fetch upstreams
# Make sure the commits look right
git log --show-signature "upstream/develop..HEAD"
git push upstream-push HEAD:develop
```
Development on `develop` can proceed as normal. It is recommended to
create a beta (or RC) immediately to ensure that everything worked as
expected.
##### An even rarer scenario: A hotfix on an old release
Historically, once a final release is tagged and packages are released,
versions older than the latest final release are no longer supported.
However, there is a possibility that a very high severity bug may occur
in a non-amendment blocked version that is still being run by
a significant fraction of users, which would necessitate a hotfix / point
release to that version as well as any later versions.
This scenario would follow the same basic procedure as above,
except that *none* of `develop`, `release`, or `master`
would be touched during the release process.
In this example, consider if version 2.1.1 needed to be patched.
1. Create two branches in the main (`upstream`) repo.
```
git fetch upstreams
# Create a base branch off the tag
git checkout --no-track -b master-2.1.2 2.1.1
git push upstream-push
# Create a working branch
git checkout --no-track -b master212-next master-2.1.2
git push upstream-push
git fetch upstreams
```
2. Work continues as above, except using `master-2.1.2` as
the base branch for any merging, packaging, etc.
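For example, the version commit for the work branch could be created
with the [update-version] script (illustrative values):
```
./bin/git/update-version.sh master212-next upstream/master-2.1.2 2.1.2
```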
3. After the release is tagged and packages are built, you could
potentially delete both branches, e.g. `master-2.1.2` and
`master212-next`. However, it may be useful to keep `master-2.1.2`
around indefinitely for reference.
4. If a hotfix is also released for the latest
version in parallel with this one, or if the issue is
already fixed in the latest version, do not do any
reverse merges. However, if it is not, it probably makes
sense to reverse merge `master-2.1.2` into `master`,
release a hotfix for _that_ version, then reverse merge
from `master` to `develop`. (Please don't do this unless absolutely
necessary.)
[contrib]: https://docs.github.com/en/get-started/quickstart/contributing-to-projects
[squash]: https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/incorporating-changes-from-a-pull-request/about-pull-request-merges#squash-and-merge-your-commits
[forking]: https://github.com/XRPLF/rippled/fork
[rippled]: https://github.com/XRPLF/rippled
[signing]: https://docs.github.com/en/authentication/managing-commit-signature-verification/about-commit-signature-verification
[setup-upstreams]: ./bin/git/setup-upstreams.sh
[squash-branches]: ./bin/git/squash-branches.sh
[update-version]: ./bin/git/update-version.sh


@@ -1,3 +1,5 @@
[![codecov](https://codecov.io/gh/XRPLF/rippled/graph/badge.svg?token=WyFr5ajq3O)](https://codecov.io/gh/XRPLF/rippled)
# The XRP Ledger
The [XRP Ledger](https://xrpl.org/) is a decentralized cryptographic ledger powered by a network of peer-to-peer nodes. The XRP Ledger uses a novel Byzantine Fault Tolerant consensus algorithm to settle and record transactions in a secure distributed database without a central operator.

File diff suppressed because it is too large

@@ -83,7 +83,7 @@ To report a qualifying bug, please send a detailed report to:
|Long Key ID | `0xCD49A0AFC57929BE` |
|Fingerprint | `24E6 3B02 37E0 FA9C 5E96 8974 CD49 A0AF C579 29BE` |
The full PGP key for this address, which is also available on several key servers (e.g. on [keys.gnupg.net](https://keys.gnupg.net)), is:
The full PGP key for this address, which is also available on several key servers (e.g. on [keyserver.ubuntu.com](https://keyserver.ubuntu.com)), is:
```
-----BEGIN PGP PUBLIC KEY BLOCK-----
mQINBFUwGHYBEAC0wpGpBPkd8W1UdQjg9+cEFzeIEJRaoZoeuJD8mofwI5Ejnjdt

bin/git/setup-upstreams.sh Executable file

@@ -0,0 +1,86 @@
#!/bin/bash
if [[ $# -ne 1 || "$1" == "--help" || "$1" == "-h" ]]
then
name=$( basename $0 )
cat <<- USAGE
Usage: $name <username>
Where <username> is the Github username of the upstream repo. e.g. XRPLF
USAGE
exit 0
fi
# Create upstream remotes based on origin
user="$1"
shift
# Minimal helper assumed by the _run calls below: echo a command, then run it
_run() {
  echo "$@"
  "$@"
}
# Get the origin URL. Expect it be an SSH-style URL
origin=$( git remote get-url origin )
if [[ "${origin}" == "" ]]
then
echo Invalid origin remote >&2
exit 1
fi
# echo "Origin: ${origin}"
# Parse the origin
ifs_orig="${IFS}"
IFS=':' read remote originpath <<< "${origin}"
# echo "Remote: ${remote}, Originpath: ${originpath}"
IFS='@' read sshuser server <<< "${remote}"
# echo "SSHUser: ${sshuser}, Server: ${server}"
IFS='/' read originuser repo <<< "${originpath}"
# echo "Originuser: ${originuser}, Repo: ${repo}"
if [[ "${sshuser}" == "" || "${server}" == "" || "${originuser}" == ""
|| "${repo}" == "" ]]
then
echo "Can't parse origin URL: ${origin}" >&2
exit 1
fi
upstream="https://${server}/${user}/${repo}"
upstreampush="${remote}:${user}/${repo}"
upstreamgroup="upstream upstream-push"
current=$( git remote get-url upstream 2>/dev/null )
currentpush=$( git remote get-url upstream-push 2>/dev/null )
currentgroup=$( git config remotes.upstreams )
if [[ "${current}" == "${upstream}" ]]
then
echo "Upstream already set up correctly. Skip"
elif [[ -n "${current}" && "${current}" != "${upstream}" &&
"${current}" != "${upstreampush}" ]]
then
echo "Upstream already set up as: ${current}. Skip"
else
if [[ "${current}" == "${upstreampush}" ]]
then
echo "Upstream set to dangerous push URL. Update."
_run git remote rename upstream upstream-push || \
_run git remote remove upstream
currentpush=$( git remote get-url upstream-push 2>/dev/null )
fi
_run git remote add upstream "${upstream}"
fi
if [[ "${currentpush}" == "${upstreampush}" ]]
then
echo "upstream-push already set up correctly. Skip"
elif [[ -n "${currentpush}" && "${currentpush}" != "${upstreampush}" ]]
then
echo "upstream-push already set up as: ${currentpush}. Skip"
else
_run git remote add upstream-push "${upstreampush}"
fi
if [[ "${currentgroup}" == "${upstreamgroup}" ]]
then
echo "Upstreams group already set up correctly. Skip"
elif [[ -n "${currentgroup}" && "${currentgroup}" != "${upstreamgroup}" ]]
then
echo "Upstreams group already set up as: ${currentgroup}. Skip"
else
_run git config --add remotes.upstreams "${upstreamgroup}"
fi
_run git fetch --jobs=$(nproc) upstreams
exit 0

bin/git/squash-branches.sh Executable file

@@ -0,0 +1,69 @@
#!/bin/bash
if [[ $# -lt 3 || "$1" == "--help" || "$1" = "-h" ]]
then
name=$( basename $0 )
cat <<- USAGE
Usage: $name workbranch base/branch user/branch [user/branch [...]]
* workbranch will be created locally from base/branch
* base/branch and user/branch may be specified as user:branch to allow
easy copying from Github PRs
* Remotes for each user must already be set up
USAGE
exit 0
fi
work="$1"
shift
branches=( $( echo "${@}" | sed "s/:/\//g" ) )
base="${branches[0]}"
unset branches[0]
set -e
users=()
for b in "${branches[@]}"
do
users+=( $( echo $b | cut -d/ -f1 ) )
done
users=( $( printf '%s\n' "${users[@]}" | sort -u ) )
git fetch --multiple upstreams "${users[@]}"
git checkout -B "$work" --no-track "$base"
for b in "${branches[@]}"
do
git merge --squash "${b}"
git commit -S # Use the commit message provided on the PR
done
# Make sure the commits look right
git log --show-signature "$base..HEAD"
parts=( $( echo $base | sed "s/\// /" ) )
repo="${parts[0]}"
b="${parts[1]}"
push=$repo
if [[ "$push" == "upstream" ]]
then
push="upstream-push"
fi
if [[ "$repo" == "upstream" ]]
then
repo="upstreams"
fi
cat << PUSH
-------------------------------------------------------------------
This script will not push. Verify everything is correct, then push
to your repo, and create a PR if necessary. Once the PR is approved,
run:
git push $push HEAD:$b
git fetch $repo
-------------------------------------------------------------------
PUSH

bin/git/update-version.sh Executable file

@@ -0,0 +1,58 @@
#!/bin/bash
if [[ $# -ne 3 || "$1" == "--help" || "$1" = "-h" ]]
then
name=$( basename $0 )
cat <<- USAGE
Usage: $name workbranch base/branch version
* workbranch will be created locally from base/branch. If it exists,
it will be reused, so make sure you don't overwrite any work.
* base/branch may be specified as user:branch to allow easy copying
from Github PRs.
USAGE
exit 0
fi
work="$1"
shift
base=$( echo "$1" | sed "s/:/\//" )
shift
version=$1
shift
set -e
git fetch upstreams
git checkout -B "${work}" --no-track "${base}"
push=$( git rev-parse --abbrev-ref --symbolic-full-name '@{push}' \
2>/dev/null ) || true
if [[ "${push}" != "" ]]
then
echo "Warning: ${push} may already exist."
fi
build=$( find . -name BuildInfo.cpp )
sed 's/\(^.*versionString =\).*$/\1 "'${version}'"/' ${build} > version.cpp && \
diff "${build}" version.cpp && exit 1 || \
mv -vi version.cpp ${build}
git diff
git add ${build}
git commit -S -m "Set version to ${version}"
git log --oneline --first-parent ${base}^..
cat << PUSH
-------------------------------------------------------------------
This script will not push. Verify everything is correct, then push
to your repo, and create a PR as described in CONTRIBUTING.md.
-------------------------------------------------------------------
PUSH


@@ -410,14 +410,17 @@
# starter list is included in the code and used if no other hostnames are
# available.
#
# One address or domain name per line is allowed. A port may must be
# specified after adding a space to the address. The ordering of entries
# does not generally matter.
# One address or domain name per line is allowed. A port may be specified
# after adding a space to the address. If a port is not specified, the default
# port of 2459 will be used. Many servers still use the legacy port of 51235.
# To connect to such servers, you must specify the port number. The ordering
# of entries does not generally matter.
#
# The default list of entries is:
# - r.ripple.com 51235
# - sahyadri.isrdc.in 51235
# - hubs.xrpkuwait.com 51235
# - hub.xrpl-commons.org 51235
#
# Examples:
#
@@ -1423,6 +1426,9 @@ admin = 127.0.0.1
protocol = http
[port_peer]
# Many servers still use the legacy port of 51235, so for backward-compatibility
# we maintain that port number here. However, for new servers we recommend
# changing this to the default port of 2459.
port = 51235
ip = 0.0.0.0
# alternatively, to accept connections on IPv4 + IPv6, use:


@@ -26,7 +26,7 @@
#
# Examples:
# https://vl.ripple.com
# https://vl.xrplf.org
# https://unl.xrplf.org
# http://127.0.0.1:8000
# file:///etc/opt/ripple/vl.txt
#
@@ -54,13 +54,13 @@
[validator_list_sites]
https://vl.ripple.com
https://vl.xrplf.org
https://unl.xrplf.org
[validator_list_keys]
#vl.ripple.com
ED2677ABFFD1B33AC6FBC3062B71F1E8397C1505E1C42C64D11AD1B28FF73F4734
# vl.xrplf.org
ED45D1840EE724BE327ABE9146503D5848EFD5F38B6D5FEDE71E80ACCE5E6E738B
#unl.xrplf.org
ED42AEC58B701EEBB77356FFFEC26F83C1F0407263530F068C7C73D392C7E06FD1
# To use the test network (see https://xrpl.org/connect-your-rippled-to-the-xrp-test-net.html),
# use the following configuration instead:
@@ -70,3 +70,21 @@ ED45D1840EE724BE327ABE9146503D5848EFD5F38B6D5FEDE71E80ACCE5E6E738B
#
# [validator_list_keys]
# ED264807102805220DA0F312E71FC2C69E1552C9C5790F6C25E3729DEB573D5860
# [validator_list_threshold]
#
# Minimum number of validator lists on which a validator must be listed in
# order to be used.
#
# This can be set explicitly to any positive integer number not greater than
# the size of [validator_list_keys]. If it is not set, or set to 0, the
# value will be calculated at startup from the size of [validator_list_keys],
# where the calculation is:
#
# threshold = size(validator_list_keys) < 3
# ? 1
# : floor(size(validator_list_keys) / 2) + 1
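# For example, with five entries in [validator_list_keys], the default
# threshold is floor(5 / 2) + 1 = 3.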
[validator_list_threshold]
0


@@ -98,6 +98,9 @@
# 2024-04-03, Bronek Kozicki
# - add support for output formats: jacoco, clover, lcov
#
# 2025-05-12, Jingchen Wu
# - add -fprofile-update=atomic to ensure atomic profile generation
#
# USAGE:
#
# 1. Copy this file into your cmake modules path.
@@ -200,15 +203,27 @@ set(COVERAGE_COMPILER_FLAGS "-g --coverage"
CACHE INTERNAL "")
if(CMAKE_CXX_COMPILER_ID MATCHES "(GNU|Clang)")
include(CheckCXXCompilerFlag)
include(CheckCCompilerFlag)
check_cxx_compiler_flag(-fprofile-abs-path HAVE_cxx_fprofile_abs_path)
if(HAVE_cxx_fprofile_abs_path)
set(COVERAGE_CXX_COMPILER_FLAGS "${COVERAGE_COMPILER_FLAGS} -fprofile-abs-path")
endif()
include(CheckCCompilerFlag)
check_c_compiler_flag(-fprofile-abs-path HAVE_c_fprofile_abs_path)
if(HAVE_c_fprofile_abs_path)
set(COVERAGE_C_COMPILER_FLAGS "${COVERAGE_COMPILER_FLAGS} -fprofile-abs-path")
endif()
check_cxx_compiler_flag(-fprofile-update HAVE_cxx_fprofile_update)
if(HAVE_cxx_fprofile_update)
set(COVERAGE_CXX_COMPILER_FLAGS "${COVERAGE_COMPILER_FLAGS} -fprofile-update=atomic")
endif()
check_c_compiler_flag(-fprofile-update HAVE_c_fprofile_update)
if(HAVE_c_fprofile_update)
set(COVERAGE_C_COMPILER_FLAGS "${COVERAGE_COMPILER_FLAGS} -fprofile-update=atomic")
endif()
endif()
set(CMAKE_Fortran_FLAGS_COVERAGE


@@ -9,6 +9,7 @@ include(target_protobuf_sources)
# define a bunch of `static const` variables with the same names,
# so we just build them as a separate library.
add_library(xrpl.libpb)
set_target_properties(xrpl.libpb PROPERTIES UNITY_BUILD OFF)
target_protobuf_sources(xrpl.libpb xrpl/proto
LANGUAGE cpp
IMPORT_DIRS include/xrpl/proto
@@ -49,7 +50,9 @@ target_link_libraries(xrpl.libpb
# TODO: Clean up the number of library targets later.
add_library(xrpl.imports.main INTERFACE)
target_link_libraries(xrpl.imports.main INTERFACE
target_link_libraries(xrpl.imports.main
INTERFACE
LibArchive::LibArchive
OpenSSL::Crypto
Ripple::boost
@@ -59,7 +62,10 @@ target_link_libraries(xrpl.imports.main INTERFACE
date::date
ed25519::ed25519
secp256k1::secp256k1
xrpl.libpb
xxHash::xxhash
blake3
$<$<BOOL:${voidstar}>:antithesis-sdk-cpp>
)
include(add_module)
@@ -100,9 +106,6 @@ target_link_libraries(xrpl.libxrpl.server PUBLIC xrpl.libxrpl.protocol)
add_library(xrpl.libxrpl)
set_target_properties(xrpl.libxrpl PROPERTIES OUTPUT_NAME xrpl)
if(unity)
set_target_properties(xrpl.libxrpl PROPERTIES UNITY_BUILD ON)
endif()
add_library(xrpl::libxrpl ALIAS xrpl.libxrpl)
@@ -130,41 +133,13 @@ target_link_modules(xrpl PUBLIC
# $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
# $<INSTALL_INTERFACE:include>)
target_compile_definitions(xrpl.libxrpl
PUBLIC
BOOST_ASIO_USE_TS_EXECUTOR_AS_DEFAULT
BOOST_CONTAINER_FWD_BAD_DEQUE
HAS_UNCAUGHT_EXCEPTIONS=1)
target_compile_options(xrpl.libxrpl
PUBLIC
$<$<BOOL:${is_gcc}>:-Wno-maybe-uninitialized>
$<$<BOOL:${voidstar}>:-DENABLE_VOIDSTAR>
)
target_link_libraries(xrpl.libxrpl
PUBLIC
LibArchive::LibArchive
OpenSSL::Crypto
Ripple::boost
Ripple::opts
Ripple::syslibs
absl::random_random
date::date
ed25519::ed25519
secp256k1::secp256k1
xrpl.libpb
xxHash::xxhash
$<$<BOOL:${voidstar}>:antithesis-sdk-cpp>
)
if(xrpld)
add_executable(rippled)
if(unity)
set_target_properties(rippled PROPERTIES UNITY_BUILD ON)
endif()
if(tests)
target_compile_definitions(rippled PUBLIC ENABLE_TESTS)
target_compile_definitions(rippled PRIVATE
UNIT_TEST_REFERENCE_FEE=${UNIT_TEST_REFERENCE_FEE}
)
endif()
target_include_directories(rippled
PRIVATE


@@ -53,9 +53,9 @@ set(download_script "${CMAKE_BINARY_DIR}/docs/download-cppreference.cmake")
file(WRITE
"${download_script}"
"file(DOWNLOAD \
http://upload.cppreference.com/mwiki/images/b/b2/html_book_20190607.zip \
https://github.com/PeterFeicht/cppreference-doc/releases/download/v20250209/html-book-20250209.zip \
${CMAKE_BINARY_DIR}/docs/cppreference.zip \
EXPECTED_HASH MD5=82b3a612d7d35a83e3cb1195a63689ab \
EXPECTED_HASH MD5=bda585f72fbca4b817b29a3d5746567b \
)\n \
execute_process( \
COMMAND \"${CMAKE_COMMAND}\" -E tar -xf cppreference.zip \


@@ -7,6 +7,9 @@ add_library (Ripple::opts ALIAS opts)
target_compile_definitions (opts
INTERFACE
BOOST_ASIO_DISABLE_HANDLER_TYPE_REQUIREMENTS
BOOST_ASIO_USE_TS_EXECUTOR_AS_DEFAULT
BOOST_CONTAINER_FWD_BAD_DEQUE
HAS_UNCAUGHT_EXCEPTIONS=1
$<$<BOOL:${boost_show_deprecated}>:
BOOST_ASIO_NO_DEPRECATED
BOOST_FILESYSTEM_NO_DEPRECATED
@@ -18,10 +21,12 @@ target_compile_definitions (opts
>
$<$<BOOL:${beast_no_unit_test_inline}>:BEAST_NO_UNIT_TEST_INLINE=1>
$<$<BOOL:${beast_disable_autolink}>:BEAST_DONT_AUTOLINK_TO_WIN32_LIBRARIES=1>
$<$<BOOL:${single_io_service_thread}>:RIPPLE_SINGLE_IO_SERVICE_THREAD=1>)
$<$<BOOL:${single_io_service_thread}>:RIPPLE_SINGLE_IO_SERVICE_THREAD=1>
$<$<BOOL:${voidstar}>:ENABLE_VOIDSTAR>)
target_compile_options (opts
INTERFACE
$<$<AND:$<BOOL:${is_gcc}>,$<COMPILE_LANGUAGE:CXX>>:-Wsuggest-override>
$<$<BOOL:${is_gcc}>:-Wno-maybe-uninitialized>
$<$<BOOL:${perf}>:-fno-omit-frame-pointer>
$<$<AND:$<BOOL:${is_gcc}>,$<BOOL:${coverage}>>:-g --coverage -fprofile-abs-path>
$<$<AND:$<BOOL:${is_clang}>,$<BOOL:${coverage}>>:-g --coverage>


@@ -2,16 +2,6 @@
convenience variables and sanity checks
#]===================================================================]
include(ProcessorCount)
if (NOT ep_procs)
ProcessorCount(ep_procs)
if (ep_procs GREATER 1)
# never use more than half of cores for EP builds
math (EXPR ep_procs "${ep_procs} / 2")
message (STATUS "Using ${ep_procs} cores for ExternalProject builds.")
endif ()
endif ()
get_property(is_multiconfig GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
set (CMAKE_CONFIGURATION_TYPES "Debug;Release" CACHE STRING "" FORCE)


@@ -11,12 +11,19 @@ option(assert "Enables asserts, even in release builds" OFF)
option(xrpld "Build xrpld" ON)
option(tests "Build tests" ON)
if(tests)
# This setting allows making a separate workflow to test fees other than default 10
if(NOT UNIT_TEST_REFERENCE_FEE)
set(UNIT_TEST_REFERENCE_FEE "10" CACHE STRING "")
endif()
endif()
option(unity "Creates a build using UNITY support in cmake. This is the default" ON)
if(unity)
if(NOT is_ci)
set(CMAKE_UNITY_BUILD_BATCH_SIZE 15 CACHE STRING "")
endif()
set(CMAKE_UNITY_BUILD ON CACHE BOOL "Do a unity build")
endif()
if(is_clang AND is_linux)
option(voidstar "Enable Antithesis instrumentation." OFF)


@@ -1,4 +1,4 @@
from conan import ConanFile
from conan import ConanFile, __version__ as conan_version
from conan.tools.cmake import CMake, CMakeToolchain, cmake_layout
import re
@@ -24,14 +24,12 @@ class Xrpl(ConanFile):
}
requires = [
'date/3.0.1',
'grpc/1.50.1',
'libarchive/3.6.2',
'libarchive/3.7.6',
'nudb/2.0.8',
'openssl/1.1.1u',
'openssl/1.1.1v',
'soci/4.0.3',
'xxhash/0.8.2',
'zlib/1.2.13',
'zlib/1.3.1',
]
tool_requires = [
@@ -99,14 +97,18 @@ class Xrpl(ConanFile):
self.options['boost'].visibility = 'global'
def requirements(self):
self.requires('boost/1.82.0', force=True)
self.requires('lz4/1.9.3', force=True)
# Conan 2 requires transitive headers to be specified
transitive_headers_opt = {'transitive_headers': True} if conan_version.split('.')[0] == '2' else {}
self.requires('boost/1.83.0', force=True, **transitive_headers_opt)
self.requires('date/3.0.3', **transitive_headers_opt)
self.requires('lz4/1.10.0', force=True)
self.requires('protobuf/3.21.9', force=True)
self.requires('sqlite3/3.42.0', force=True)
self.requires('sqlite3/3.47.0', force=True)
if self.options.jemalloc:
self.requires('jemalloc/5.3.0')
if self.options.rocksdb:
self.requires('rocksdb/6.29.5')
self.requires('rocksdb/9.7.3')
self.requires('xxhash/0.8.2', **transitive_headers_opt)
exports_sources = (
'CMakeLists.txt',


@@ -23,7 +23,7 @@ direction.
```
apt update
apt install --yes curl git libssl-dev python3.10-dev python3-pip make g++-11 libprotobuf-dev protobuf-compiler
apt install --yes curl git libssl-dev pipx python3.10-dev python3-pip make g++-11 libprotobuf-dev protobuf-compiler
curl --location --remote-name \
"https://github.com/Kitware/CMake/releases/download/v3.25.1/cmake-3.25.1.tar.gz"
@@ -35,7 +35,8 @@ make --jobs $(nproc)
make install
cd ..
pip3 install 'conan<2'
pipx install 'conan<2'
pipx ensurepath
```
[1]: https://github.com/thejohnfreeman/rippled-docker/blob/master/ubuntu-22.04/install.sh

View File

@@ -558,7 +558,7 @@ struct ConsensusResult
ConsensusTimer roundTime;
// Indicates state in which consensus ended. Once in the accept phase
// will be either Yes or MovedOn
// will be either Yes or MovedOn or Expired
ConsensusState state = ConsensusState::No;
};

external/blake3/CMakeLists.txt vendored Normal file

@@ -0,0 +1,383 @@
cmake_minimum_required(VERSION 3.9 FATAL_ERROR)
# respect C_EXTENSIONS OFF without explicitly setting C_STANDARD
if (POLICY CMP0128)
cmake_policy(SET CMP0128 NEW)
endif()
# mark_as_advanced does not implicitly create UNINITIALIZED cache entries
if (POLICY CMP0102)
cmake_policy(SET CMP0102 NEW)
endif()
project(libblake3
VERSION 1.8.2
DESCRIPTION "BLAKE3 C implementation"
LANGUAGES C CXX ASM
)
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
option(BLAKE3_USE_TBB "Enable oneTBB parallelism" OFF)
option(BLAKE3_FETCH_TBB "Allow fetching oneTBB from GitHub if not found on system" OFF)
include(CTest)
include(FeatureSummary)
include(GNUInstallDirs)
add_subdirectory(dependencies)
# architecture lists for which to enable assembly / SIMD sources
set(BLAKE3_AMD64_NAMES amd64 AMD64 x86_64)
set(BLAKE3_X86_NAMES i686 x86 X86)
set(BLAKE3_ARMv8_NAMES aarch64 AArch64 arm64 ARM64 armv8 armv8a)
# default SIMD compiler flag configuration (can be overridden by toolchains or CLI)
if(MSVC)
set(BLAKE3_CFLAGS_SSE2 "/arch:SSE2" CACHE STRING "the compiler flags to enable SSE2")
# MSVC has no dedicated sse4.1 flag (see https://learn.microsoft.com/en-us/cpp/build/reference/arch-x86?view=msvc-170)
set(BLAKE3_CFLAGS_SSE4.1 "/arch:AVX" CACHE STRING "the compiler flags to enable SSE4.1")
set(BLAKE3_CFLAGS_AVX2 "/arch:AVX2" CACHE STRING "the compiler flags to enable AVX2")
set(BLAKE3_CFLAGS_AVX512 "/arch:AVX512" CACHE STRING "the compiler flags to enable AVX512")
set(BLAKE3_AMD64_ASM_SOURCES
blake3_avx2_x86-64_windows_msvc.asm
blake3_avx512_x86-64_windows_msvc.asm
blake3_sse2_x86-64_windows_msvc.asm
blake3_sse41_x86-64_windows_msvc.asm
)
elseif(CMAKE_C_COMPILER_ID STREQUAL "GNU"
OR CMAKE_C_COMPILER_ID STREQUAL "Clang"
OR CMAKE_C_COMPILER_ID STREQUAL "AppleClang")
set(BLAKE3_CFLAGS_SSE2 "-msse2" CACHE STRING "the compiler flags to enable SSE2")
set(BLAKE3_CFLAGS_SSE4.1 "-msse4.1" CACHE STRING "the compiler flags to enable SSE4.1")
set(BLAKE3_CFLAGS_AVX2 "-mavx2" CACHE STRING "the compiler flags to enable AVX2")
set(BLAKE3_CFLAGS_AVX512 "-mavx512f -mavx512vl" CACHE STRING "the compiler flags to enable AVX512")
if (WIN32 OR CYGWIN)
set(BLAKE3_AMD64_ASM_SOURCES
blake3_avx2_x86-64_windows_gnu.S
blake3_avx512_x86-64_windows_gnu.S
blake3_sse2_x86-64_windows_gnu.S
blake3_sse41_x86-64_windows_gnu.S
)
elseif(UNIX)
set(BLAKE3_AMD64_ASM_SOURCES
blake3_avx2_x86-64_unix.S
blake3_avx512_x86-64_unix.S
blake3_sse2_x86-64_unix.S
blake3_sse41_x86-64_unix.S
)
endif()
if (CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_ARMv8_NAMES
AND NOT CMAKE_SIZEOF_VOID_P EQUAL 8)
# 32-bit ARMv8 needs NEON to be enabled explicitly
set(BLAKE3_CFLAGS_NEON "-mfpu=neon" CACHE STRING "the compiler flags to enable NEON")
endif()
endif()
mark_as_advanced(BLAKE3_CFLAGS_SSE2 BLAKE3_CFLAGS_SSE4.1 BLAKE3_CFLAGS_AVX2 BLAKE3_CFLAGS_AVX512 BLAKE3_CFLAGS_NEON)
mark_as_advanced(BLAKE3_AMD64_ASM_SOURCES)
message(STATUS "BLAKE3 SIMD configuration: ${CMAKE_C_COMPILER_ARCHITECTURE_ID}")
if(MSVC AND DEFINED CMAKE_C_COMPILER_ARCHITECTURE_ID)
if(CMAKE_C_COMPILER_ARCHITECTURE_ID MATCHES "[Xx]86")
set(BLAKE3_SIMD_TYPE "x86-intrinsics" CACHE STRING "the SIMD acceleration type to use")
elseif(CMAKE_C_COMPILER_ARCHITECTURE_ID MATCHES "[Xx]64")
set(BLAKE3_SIMD_TYPE "amd64-asm" CACHE STRING "the SIMD acceleration type to use")
elseif(CMAKE_C_COMPILER_ARCHITECTURE_ID MATCHES "[Aa][Rr][Mm]64")
set(BLAKE3_SIMD_TYPE "neon-intrinsics" CACHE STRING "the SIMD acceleration type to use")
else()
set(BLAKE3_SIMD_TYPE "none" CACHE STRING "the SIMD acceleration type to use")
endif()
elseif(CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_AMD64_NAMES)
set(BLAKE3_SIMD_TYPE "amd64-asm" CACHE STRING "the SIMD acceleration type to use")
elseif(CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_X86_NAMES
AND DEFINED BLAKE3_CFLAGS_SSE2
AND DEFINED BLAKE3_CFLAGS_SSE4.1
AND DEFINED BLAKE3_CFLAGS_AVX2
AND DEFINED BLAKE3_CFLAGS_AVX512)
set(BLAKE3_SIMD_TYPE "x86-intrinsics" CACHE STRING "the SIMD acceleration type to use")
elseif((CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_ARMv8_NAMES
OR ANDROID_ABI STREQUAL "armeabi-v7a"
OR BLAKE3_USE_NEON_INTRINSICS)
AND (DEFINED BLAKE3_CFLAGS_NEON
OR CMAKE_SIZEOF_VOID_P EQUAL 8))
set(BLAKE3_SIMD_TYPE "neon-intrinsics" CACHE STRING "the SIMD acceleration type to use")
else()
set(BLAKE3_SIMD_TYPE "none" CACHE STRING "the SIMD acceleration type to use")
endif()
mark_as_advanced(BLAKE3_SIMD_TYPE)
# library target
add_library(blake3
blake3.c
blake3_dispatch.c
blake3_portable.c
)
add_library(BLAKE3::blake3 ALIAS blake3)
# library configuration
set(PKG_CONFIG_CFLAGS)
if (BUILD_SHARED_LIBS)
target_compile_definitions(blake3
PUBLIC BLAKE3_DLL
PRIVATE BLAKE3_DLL_EXPORTS
)
list(APPEND PKG_CONFIG_CFLAGS -DBLAKE3_DLL)
endif()
target_include_directories(blake3 PUBLIC
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
)
set_target_properties(blake3 PROPERTIES
VERSION ${PROJECT_VERSION}
SOVERSION 0
C_VISIBILITY_PRESET hidden
C_EXTENSIONS OFF
)
target_compile_features(blake3 PUBLIC c_std_99)
if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.12)
target_compile_features(blake3 PUBLIC cxx_std_20)
# else: add it further below through `BLAKE3_CMAKE_CXXFLAGS_*`
endif()
# ensure C_EXTENSIONS OFF is respected without overriding CMAKE_C_STANDARD
# which may be set by the user or toolchain file
if (NOT POLICY CMP0128 AND NOT DEFINED CMAKE_C_STANDARD)
set_target_properties(blake3 PROPERTIES C_STANDARD 99)
endif()
# optional SIMD sources
if(BLAKE3_SIMD_TYPE STREQUAL "amd64-asm")
if (NOT DEFINED BLAKE3_AMD64_ASM_SOURCES)
message(FATAL_ERROR "BLAKE3_SIMD_TYPE is set to 'amd64-asm' but no assembly sources are available for the target architecture.")
endif()
set(BLAKE3_SIMD_AMD64_ASM ON)
if(MSVC)
enable_language(ASM_MASM)
endif()
target_sources(blake3 PRIVATE ${BLAKE3_AMD64_ASM_SOURCES})
elseif(BLAKE3_SIMD_TYPE STREQUAL "x86-intrinsics")
if (NOT DEFINED BLAKE3_CFLAGS_SSE2
OR NOT DEFINED BLAKE3_CFLAGS_SSE4.1
OR NOT DEFINED BLAKE3_CFLAGS_AVX2
OR NOT DEFINED BLAKE3_CFLAGS_AVX512)
message(FATAL_ERROR "BLAKE3_SIMD_TYPE is set to 'x86-intrinsics' but no compiler flags are available for the target architecture.")
endif()
set(BLAKE3_SIMD_X86_INTRINSICS ON)
target_sources(blake3 PRIVATE
blake3_avx2.c
blake3_avx512.c
blake3_sse2.c
blake3_sse41.c
)
set_source_files_properties(blake3_avx2.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_AVX2}")
set_source_files_properties(blake3_avx512.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_AVX512}")
set_source_files_properties(blake3_sse2.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_SSE2}")
set_source_files_properties(blake3_sse41.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_SSE4.1}")
elseif(BLAKE3_SIMD_TYPE STREQUAL "neon-intrinsics")
set(BLAKE3_SIMD_NEON_INTRINSICS ON)
target_sources(blake3 PRIVATE
blake3_neon.c
)
target_compile_definitions(blake3 PRIVATE
BLAKE3_USE_NEON=1
)
if (DEFINED BLAKE3_CFLAGS_NEON)
set_source_files_properties(blake3_neon.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_NEON}")
endif()
elseif(BLAKE3_SIMD_TYPE STREQUAL "none")
target_compile_definitions(blake3 PRIVATE
BLAKE3_USE_NEON=0
BLAKE3_NO_SSE2
BLAKE3_NO_SSE41
BLAKE3_NO_AVX2
BLAKE3_NO_AVX512
)
else()
message(FATAL_ERROR "BLAKE3_SIMD_TYPE is set to an unknown value: '${BLAKE3_SIMD_TYPE}'")
endif()
if(BLAKE3_USE_TBB)
find_package(TBB 2021.11.0 QUIET)
if(NOT TBB_FOUND AND NOT TARGET TBB::tbb)
message(WARNING
"oneTBB not found; disabling BLAKE3_USE_TBB\n"
"Enable BLAKE3_FETCH_TBB to automatically fetch and build oneTBB"
)
set(BLAKE3_USE_TBB OFF)
else()
target_sources(blake3
PRIVATE
blake3_tbb.cpp)
target_link_libraries(blake3
PUBLIC
# Make shared TBB a transitive dependency. The consuming program is technically not required
# to link TBB in order for libblake3 to function but we do this in order to prevent the
# possibility of multiple separate TBB runtimes being linked into a final program in case
# the consuming program also happens to already use TBB.
TBB::tbb)
target_compile_definitions(blake3
PUBLIC
BLAKE3_USE_TBB)
endif()
list(APPEND PKG_CONFIG_REQUIRES "tbb >= ${TBB_VERSION}")
list(APPEND PKG_CONFIG_CFLAGS -DBLAKE3_USE_TBB)
include(CheckCXXSymbolExists)
check_cxx_symbol_exists(_LIBCPP_VERSION "version" BLAKE3_HAVE_LIBCPP)
check_cxx_symbol_exists(__GLIBCXX__ "version" BLAKE3_HAVE_GLIBCXX)
if(BLAKE3_HAVE_GLIBCXX)
list(APPEND PKG_CONFIG_LIBS -lstdc++)
elseif(BLAKE3_HAVE_LIBCPP)
list(APPEND PKG_CONFIG_LIBS -lc++)
endif()
endif()
if(BLAKE3_USE_TBB)
# Define some scratch variables for building appropriate flags per compiler
if(CMAKE_VERSION VERSION_LESS 3.12)
list(APPEND BLAKE3_CXX_STANDARD_FLAGS_GNU -std=c++20)
list(APPEND BLAKE3_CXX_STANDARD_FLAGS_MSVC /std:c++20)
endif()
set(BLAKE3_CXXFLAGS_GNU "-fno-exceptions;-fno-rtti;${BLAKE3_CXX_STANDARD_FLAGS_GNU}" CACHE STRING "C++ flags used for compiling private BLAKE3 library components with GNU-like compiler frontends.")
set(BLAKE3_CXXFLAGS_MSVC "/EHs-c-;/GR-;${BLAKE3_CXX_STANDARD_FLAGS_MSVC}" CACHE STRING "C++ flags used for compiling private BLAKE3 library components with MSVC-like compiler frontends.")
# Get the C++ compiler name without extension
get_filename_component(BLAKE3_CMAKE_CXX_COMPILER_NAME "${CMAKE_CXX_COMPILER}" NAME_WE)
# Strip any trailing versioning from the C++ compiler name
string(REGEX MATCH "^(clang\\+\\+|clang-cl)" BLAKE3_CMAKE_CXX_COMPILER_NAME "${BLAKE3_CMAKE_CXX_COMPILER_NAME}")
# TODO: Simplify with CMAKE_CXX_COMPILER_FRONTEND_VARIANT once min CMake version is 3.14.
if(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang")
target_compile_options(blake3 PRIVATE $<$<COMPILE_LANGUAGE:CXX>:${BLAKE3_CXXFLAGS_GNU}>)
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
if(BLAKE3_CMAKE_CXX_COMPILER_NAME STREQUAL "clang++")
target_compile_options(blake3 PRIVATE $<$<COMPILE_LANGUAGE:CXX>:${BLAKE3_CXXFLAGS_GNU}>)
elseif(BLAKE3_CMAKE_CXX_COMPILER_NAME STREQUAL "clang-cl")
target_compile_options(blake3 PRIVATE $<$<COMPILE_LANGUAGE:CXX>:${BLAKE3_CXXFLAGS_MSVC}>)
endif()
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
target_compile_options(blake3 PRIVATE $<$<COMPILE_LANGUAGE:CXX>:${BLAKE3_CXXFLAGS_GNU}>)
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
target_compile_options(blake3 PRIVATE $<$<COMPILE_LANGUAGE:CXX>:${BLAKE3_CXXFLAGS_MSVC}>)
endif()
# Undefine scratch variables
unset(BLAKE3_CXX_STANDARD_FLAGS_GNU)
unset(BLAKE3_CXX_STANDARD_FLAGS_MSVC)
unset(BLAKE3_CMAKE_CXX_COMPILER_NAME)
unset(BLAKE3_CXXFLAGS_GNU)
unset(BLAKE3_CXXFLAGS_MSVC)
endif()
# cmake install support
install(FILES blake3.h DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}")
install(TARGETS blake3 EXPORT blake3-targets
ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}"
LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}"
RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}"
)
install(EXPORT blake3-targets
NAMESPACE BLAKE3::
DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/blake3"
)
include(CMakePackageConfigHelpers)
configure_package_config_file(blake3-config.cmake.in
"${CMAKE_CURRENT_BINARY_DIR}/blake3-config.cmake"
INSTALL_DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/blake3"
)
write_basic_package_version_file(
"${CMAKE_CURRENT_BINARY_DIR}/blake3-config-version.cmake"
VERSION ${libblake3_VERSION}
COMPATIBILITY SameMajorVersion
)
install(FILES
"${CMAKE_CURRENT_BINARY_DIR}/blake3-config.cmake"
"${CMAKE_CURRENT_BINARY_DIR}/blake3-config-version.cmake"
DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/blake3"
)
# Function for joining paths known from most languages
#
# SPDX-License-Identifier: (MIT OR CC0-1.0)
# Copyright 2020 Jan Tojnar
# https://github.com/jtojnar/cmake-snips
#
# Modelled after Pythons os.path.join
# https://docs.python.org/3.7/library/os.path.html#os.path.join
# Windows not supported
function(join_paths joined_path first_path_segment)
set(temp_path "${first_path_segment}")
foreach(current_segment IN LISTS ARGN)
if(NOT ("${current_segment}" STREQUAL ""))
if(IS_ABSOLUTE "${current_segment}")
set(temp_path "${current_segment}")
else()
set(temp_path "${temp_path}/${current_segment}")
endif()
endif()
endforeach()
set(${joined_path} "${temp_path}" PARENT_SCOPE)
endfunction()
# In-place rewrite a string and join by `sep`.
#
# TODO: Replace function with list(JOIN) when updating to CMake 3.12
function(join_pkg_config_field sep requires)
set(_requires "${${requires}}") # avoid shadowing issues, e.g. "${requires}"=len
list(LENGTH "${requires}" len)
set(idx 1)
foreach(req IN LISTS _requires)
string(APPEND acc "${req}")
if(idx LESS len)
string(APPEND acc "${sep}")
endif()
math(EXPR idx "${idx} + 1")
endforeach()
set("${requires}" "${acc}" PARENT_SCOPE)
endfunction()
# pkg-config support
join_pkg_config_field(", " PKG_CONFIG_REQUIRES)
join_pkg_config_field(" " PKG_CONFIG_LIBS)
join_pkg_config_field(" " PKG_CONFIG_CFLAGS)
join_paths(PKG_CONFIG_INSTALL_LIBDIR "\${prefix}" "${CMAKE_INSTALL_LIBDIR}")
join_paths(PKG_CONFIG_INSTALL_INCLUDEDIR "\${prefix}" "${CMAKE_INSTALL_INCLUDEDIR}")
configure_file(libblake3.pc.in libblake3.pc @ONLY)
install(FILES "${CMAKE_BINARY_DIR}/libblake3.pc"
DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
# print feature summary
# add_feature_info cannot directly use the BLAKE3_SIMD_TYPE :(
add_feature_info("AMD64 assembly" BLAKE3_SIMD_AMD64_ASM "The library uses hand written amd64 SIMD assembly.")
add_feature_info("x86 SIMD intrinsics" BLAKE3_SIMD_X86_INTRINSICS "The library uses x86 SIMD intrinsics.")
add_feature_info("NEON SIMD intrinsics" BLAKE3_SIMD_NEON_INTRINSICS "The library uses NEON SIMD intrinsics.")
add_feature_info("oneTBB parallelism" BLAKE3_USE_TBB "The library uses oneTBB parallelism.")
feature_summary(WHAT ENABLED_FEATURES)
if(BLAKE3_EXAMPLES)
include(BLAKE3/Examples)
endif()
if(BLAKE3_TESTING)
include(BLAKE3/Testing)
endif()

external/blake3/Makefile.testing vendored Normal file

@@ -0,0 +1,82 @@
# This Makefile is only for testing. C callers should follow the instructions
# in ./README.md to incorporate these C files into their existing build.
NAME=blake3
CC=gcc
CFLAGS=-O3 -Wall -Wextra -std=c11 -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2 -fPIE -fvisibility=hidden
LDFLAGS=-pie -Wl,-z,relro,-z,now
TARGETS=
ASM_TARGETS=
EXTRAFLAGS=-Wa,--noexecstack
ifdef BLAKE3_NO_SSE2
EXTRAFLAGS += -DBLAKE3_NO_SSE2
else
TARGETS += blake3_sse2.o
ASM_TARGETS += blake3_sse2_x86-64_unix.S
endif
ifdef BLAKE3_NO_SSE41
EXTRAFLAGS += -DBLAKE3_NO_SSE41
else
TARGETS += blake3_sse41.o
ASM_TARGETS += blake3_sse41_x86-64_unix.S
endif
ifdef BLAKE3_NO_AVX2
EXTRAFLAGS += -DBLAKE3_NO_AVX2
else
TARGETS += blake3_avx2.o
ASM_TARGETS += blake3_avx2_x86-64_unix.S
endif
ifdef BLAKE3_NO_AVX512
EXTRAFLAGS += -DBLAKE3_NO_AVX512
else
TARGETS += blake3_avx512.o
ASM_TARGETS += blake3_avx512_x86-64_unix.S
endif
ifdef BLAKE3_USE_NEON
EXTRAFLAGS += -DBLAKE3_USE_NEON=1
TARGETS += blake3_neon.o
endif
ifdef BLAKE3_NO_NEON
EXTRAFLAGS += -DBLAKE3_USE_NEON=0
endif
all: blake3.c blake3_dispatch.c blake3_portable.c main.c $(TARGETS)
$(CC) $(CFLAGS) $(EXTRAFLAGS) $^ -o $(NAME) $(LDFLAGS)
blake3_sse2.o: blake3_sse2.c
$(CC) $(CFLAGS) $(EXTRAFLAGS) -c $^ -o $@ -msse2
blake3_sse41.o: blake3_sse41.c
$(CC) $(CFLAGS) $(EXTRAFLAGS) -c $^ -o $@ -msse4.1
blake3_avx2.o: blake3_avx2.c
$(CC) $(CFLAGS) $(EXTRAFLAGS) -c $^ -o $@ -mavx2
blake3_avx512.o: blake3_avx512.c
$(CC) $(CFLAGS) $(EXTRAFLAGS) -c $^ -o $@ -mavx512f -mavx512vl
blake3_neon.o: blake3_neon.c
$(CC) $(CFLAGS) $(EXTRAFLAGS) -c $^ -o $@
test: CFLAGS += -DBLAKE3_TESTING -fsanitize=address,undefined
test: all
./test.py
asm: blake3.c blake3_dispatch.c blake3_portable.c main.c $(ASM_TARGETS)
$(CC) $(CFLAGS) $(EXTRAFLAGS) $^ -o $(NAME) $(LDFLAGS)
test_asm: CFLAGS += -DBLAKE3_TESTING -fsanitize=address,undefined
test_asm: asm
./test.py
example: example.c blake3.c blake3_dispatch.c blake3_portable.c $(ASM_TARGETS)
$(CC) $(CFLAGS) $(EXTRAFLAGS) $^ -o $@ $(LDFLAGS)
clean:
rm -f $(NAME) *.o

external/blake3/README.md vendored Normal file

@@ -0,0 +1,403 @@
The official C implementation of BLAKE3.
# Example
An example program that hashes bytes from standard input and prints the
result:
```c
#include "blake3.h"
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
int main(void) {
// Initialize the hasher.
blake3_hasher hasher;
blake3_hasher_init(&hasher);
// Read input bytes from stdin.
unsigned char buf[65536];
while (1) {
ssize_t n = read(STDIN_FILENO, buf, sizeof(buf));
if (n > 0) {
blake3_hasher_update(&hasher, buf, n);
} else if (n == 0) {
break; // end of file
} else {
fprintf(stderr, "read failed: %s\n", strerror(errno));
return 1;
}
}
// Finalize the hash. BLAKE3_OUT_LEN is the default output length, 32 bytes.
uint8_t output[BLAKE3_OUT_LEN];
blake3_hasher_finalize(&hasher, output, BLAKE3_OUT_LEN);
// Print the hash as hexadecimal.
for (size_t i = 0; i < BLAKE3_OUT_LEN; i++) {
printf("%02x", output[i]);
}
printf("\n");
return 0;
}
```
The code above is included in this directory as `example.c`. If you're
on x86\_64 with a Unix-like OS, you can compile a working binary like
this:
```bash
gcc -O3 -o example example.c blake3.c blake3_dispatch.c blake3_portable.c \
blake3_sse2_x86-64_unix.S blake3_sse41_x86-64_unix.S blake3_avx2_x86-64_unix.S \
blake3_avx512_x86-64_unix.S
```
# API
## The Struct
```c
typedef struct {
// private fields
} blake3_hasher;
```
An incremental BLAKE3 hashing state, which can accept any number of
updates. This implementation doesn't allocate any heap memory, but
`sizeof(blake3_hasher)` itself is relatively large, currently 1912 bytes
on x86-64. This size can be reduced by restricting the maximum input
length, as described in Section 5.4 of [the BLAKE3
spec](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf),
but this implementation doesn't currently support that strategy.
## Common API Functions
```c
void blake3_hasher_init(
blake3_hasher *self);
```
Initialize a `blake3_hasher` in the default hashing mode.
---
```c
void blake3_hasher_update(
blake3_hasher *self,
const void *input,
size_t input_len);
```
Add input to the hasher. This can be called any number of times. This function
is always single-threaded; for multithreading see `blake3_hasher_update_tbb`
below.
---
```c
void blake3_hasher_finalize(
const blake3_hasher *self,
uint8_t *out,
size_t out_len);
```
Finalize the hasher and return an output of any length, given in bytes.
This doesn't modify the hasher itself, and it's possible to finalize
again after adding more input. The constant `BLAKE3_OUT_LEN` provides
the default output length, 32 bytes, which is recommended for most
callers. See the [Security Notes](#security-notes) below.
## Less Common API Functions
```c
void blake3_hasher_init_keyed(
blake3_hasher *self,
const uint8_t key[BLAKE3_KEY_LEN]);
```
Initialize a `blake3_hasher` in the keyed hashing mode. The key must be
exactly 32 bytes.
---
```c
void blake3_hasher_init_derive_key(
blake3_hasher *self,
const char *context);
```
Initialize a `blake3_hasher` in the key derivation mode. The context
string is given as an initialization parameter, and afterwards input key
material should be given with `blake3_hasher_update`. The context string
is a null-terminated C string which should be **hardcoded, globally
unique, and application-specific**. The context string should not
include any dynamic input like salts, nonces, or identifiers read from a
database at runtime. A good default format for the context string is
`"[application] [commit timestamp] [purpose]"`, e.g., `"example.com
2019-12-25 16:18:03 session tokens v1"`.
This function is intended for application code written in C. For
language bindings, see `blake3_hasher_init_derive_key_raw` below.
---
```c
void blake3_hasher_init_derive_key_raw(
blake3_hasher *self,
const void *context,
size_t context_len);
```
As `blake3_hasher_init_derive_key` above, except that the context string
is given as a pointer to an array of arbitrary bytes with a provided
length. This is intended for writing language bindings, where C string
conversion would add unnecessary overhead and new error cases. Unicode
strings should be encoded as UTF-8.
Application code in C should prefer `blake3_hasher_init_derive_key`,
which takes the context as a C string. If you need to use arbitrary
bytes as a context string in application code, consider whether you're
violating the requirement that context strings should be hardcoded.
---
```c
void blake3_hasher_update_tbb(
blake3_hasher *self,
const void *input,
size_t input_len);
```
Add input to the hasher, using [oneTBB] to process large inputs using multiple
threads. This can be called any number of times. This gives the same result as
`blake3_hasher_update` above.
[oneTBB]: https://uxlfoundation.github.io/oneTBB/
NOTE: This function is only enabled when the library is compiled with CMake option `BLAKE3_USE_TBB`
and when the oneTBB library is detected on the host system. See the building instructions for
further details.
To get any performance benefit from multithreading, the input buffer needs to
be large. As a rule of thumb on x86_64, `blake3_hasher_update_tbb` is _slower_
than `blake3_hasher_update` for inputs under 128 KiB. That threshold varies
quite a lot across different processors, and it's important to benchmark your
specific use case.
Hashing large files with this function usually requires
[memory-mapping](https://en.wikipedia.org/wiki/Memory-mapped_file), since
reading a file into memory in a single-threaded loop takes longer than hashing
the resulting buffer. Note that hashing a memory-mapped file with this function
produces a "random" pattern of disk reads, which can be slow on spinning disks.
Again it's important to benchmark your specific use case.
This implementation doesn't require configuration of thread resources and will
use as many cores as possible by default. More fine-grained control of
resources is possible using the [oneTBB] API.
---
```c
void blake3_hasher_finalize_seek(
const blake3_hasher *self,
uint64_t seek,
uint8_t *out,
size_t out_len);
```
The same as `blake3_hasher_finalize`, but with an additional `seek`
parameter for the starting byte position in the output stream. To
efficiently stream a large output without allocating memory, call this
function in a loop, incrementing `seek` by the output length each time.
---
```c
void blake3_hasher_reset(
blake3_hasher *self);
```
Reset the hasher to its initial state, prior to any calls to
`blake3_hasher_update`. Currently this is no different from calling
`blake3_hasher_init` or similar again.
# Security Notes
Outputs shorter than the default length of 32 bytes (256 bits) provide less security. An N-bit
BLAKE3 output is intended to provide N bits of first and second preimage resistance and N/2
bits of collision resistance, for any N up to 256. Longer outputs don't provide any additional
security.
Avoid relying on the secrecy of the output offset, that is, the `seek` argument of
`blake3_hasher_finalize_seek`. [_Block-Cipher-Based Tree Hashing_ by Aldo
Gunsing](https://eprint.iacr.org/2022/283) shows that an attacker who knows both the message
and the key (if any) can easily determine the offset of an extended output. For comparison,
AES-CTR has a similar property: if you know the key, you can decrypt a block from an unknown
position in the output stream to recover its block index. Callers with strong secret keys
aren't affected in practice, but secret offsets are a [design
smell](https://en.wikipedia.org/wiki/Design_smell) in any case.
# Building
The easiest and most complete method of compiling this library is with CMake.
This is the method described in the next section. Toward the end of the
building section there are more in depth notes about compiling manually and
things that are useful to understand if you need to integrate this library with
another build system.
## CMake
The minimum version of CMake is 3.9. The following invocations will compile and
install `libblake3`. With recent CMake:
```bash
cmake -S c -B c/build "-DCMAKE_INSTALL_PREFIX=/usr/local"
cmake --build c/build --target install
```
With an older CMake:
```bash
cd c
mkdir build
cd build
cmake .. "-DCMAKE_INSTALL_PREFIX=/usr/local"
cmake --build . --target install
```
The following options are available when compiling with CMake:
- `BLAKE3_USE_TBB`: Enable oneTBB parallelism (Requires a C++20 capable compiler)
- `BLAKE3_FETCH_TBB`: Allow fetching oneTBB from GitHub (only if not found on system)
- `BLAKE3_EXAMPLES`: Compile and install example programs
Options can be enabled like this:
```bash
cmake -S c -B c/build "-DCMAKE_INSTALL_PREFIX=/usr/local" -DBLAKE3_USE_TBB=1 -DBLAKE3_FETCH_TBB=1
```
## Building manually
We try to keep the build simple enough that you can compile this library "by
hand", and it's expected that many callers will integrate it with their
pre-existing build systems. See the `gcc` one-liner in the "Example" section
above.
### x86
Dynamic dispatch is enabled by default on x86. The implementation will
query the CPU at runtime to detect SIMD support, and it will use the
widest instruction set available. By default, `blake3_dispatch.c`
expects to be linked with code for five different instruction sets:
portable C, SSE2, SSE4.1, AVX2, and AVX-512.
For each of the x86 SIMD instruction sets, four versions are available:
three flavors of assembly (Unix, Windows MSVC, and Windows GNU) and one
version using C intrinsics. The assembly versions are generally
preferred. They perform better, they perform more consistently across
different compilers, and they build more quickly. On the other hand, the
assembly versions are x86\_64-only, and you need to select the right
flavor for your target platform.
Here's an example of building a shared library on x86\_64 Linux using
the assembly implementations:
```bash
gcc -shared -O3 -o libblake3.so blake3.c blake3_dispatch.c blake3_portable.c \
blake3_sse2_x86-64_unix.S blake3_sse41_x86-64_unix.S blake3_avx2_x86-64_unix.S \
blake3_avx512_x86-64_unix.S
```
When building the intrinsics-based implementations, you need to build
each implementation separately, with the corresponding instruction set
explicitly enabled in the compiler. Here's the same shared library using
the intrinsics-based implementations:
```bash
gcc -c -fPIC -O3 -msse2 blake3_sse2.c -o blake3_sse2.o
gcc -c -fPIC -O3 -msse4.1 blake3_sse41.c -o blake3_sse41.o
gcc -c -fPIC -O3 -mavx2 blake3_avx2.c -o blake3_avx2.o
gcc -c -fPIC -O3 -mavx512f -mavx512vl blake3_avx512.c -o blake3_avx512.o
gcc -shared -O3 -o libblake3.so blake3.c blake3_dispatch.c blake3_portable.c \
blake3_avx2.o blake3_avx512.o blake3_sse41.o blake3_sse2.o
```
Note above that building `blake3_avx512.c` requires both `-mavx512f` and
`-mavx512vl` under GCC and Clang. Under MSVC, the single `/arch:AVX512`
flag is sufficient. The MSVC equivalent of `-mavx2` is `/arch:AVX2`.
MSVC enables SSE2 and SSE4.1 by default, and it doesn't have a
corresponding flag.
If you want to omit SIMD code entirely, you need to explicitly disable
each instruction set. Here's an example of building a shared library on
x86 with only portable code:
```bash
gcc -shared -O3 -o libblake3.so -DBLAKE3_NO_SSE2 -DBLAKE3_NO_SSE41 -DBLAKE3_NO_AVX2 \
-DBLAKE3_NO_AVX512 blake3.c blake3_dispatch.c blake3_portable.c
```
### ARM NEON
The NEON implementation is enabled by default on AArch64, but not on
other ARM targets, since not all of them support it. To enable it, set
`BLAKE3_USE_NEON=1`. Here's an example of building a shared library on
ARM Linux with NEON support:
```bash
gcc -shared -O3 -o libblake3.so -DBLAKE3_USE_NEON=1 blake3.c blake3_dispatch.c \
blake3_portable.c blake3_neon.c
```
To explicitly disable using NEON instructions on AArch64, set
`BLAKE3_USE_NEON=0`.
```bash
gcc -shared -O3 -o libblake3.so -DBLAKE3_USE_NEON=0 blake3.c blake3_dispatch.c \
blake3_portable.c
```
Note that on some targets (ARMv7 in particular), extra flags may be
required to activate NEON support in the compiler. If you see an error
like...
```
/usr/lib/gcc/armv7l-unknown-linux-gnueabihf/9.2.0/include/arm_neon.h:635:1: error: inlining failed
in call to always_inline vaddq_u32: target specific option mismatch
```
...then you may need to add something like `-mfpu=neon-vfpv4
-mfloat-abi=hard`.
### Other Platforms
The portable implementation should work on most other architectures. For
example:
```bash
gcc -shared -O3 -o libblake3.so blake3.c blake3_dispatch.c blake3_portable.c
```
### Multithreading
Multithreading is available using [oneTBB], by compiling the optional C++
support file [`blake3_tbb.cpp`](./blake3_tbb.cpp). For an example of using
`mmap` (non-Windows) and `blake3_hasher_update_tbb` to get large-file
performance on par with [`b3sum`](../b3sum), see
[`example_tbb.c`](./example_tbb.c). You can build it like this:
```bash
g++ -c -O3 -fno-exceptions -fno-rtti -DBLAKE3_USE_TBB -o blake3_tbb.o blake3_tbb.cpp
gcc -O3 -o example_tbb -lstdc++ -ltbb -DBLAKE3_USE_TBB blake3_tbb.o example_tbb.c blake3.c \
blake3_dispatch.c blake3_portable.c blake3_sse2_x86-64_unix.S blake3_sse41_x86-64_unix.S \
blake3_avx2_x86-64_unix.S blake3_avx512_x86-64_unix.S
```
NOTE: `-fno-exceptions` or equivalent is required to compile `blake3_tbb.cpp`,
and public API methods with external C linkage are marked `noexcept`. Compiling
that file with exceptions enabled will fail. Compiling with RTTI disabled isn't
required but is recommended for code size.

external/blake3/blake3-config.cmake.in vendored Normal file

@@ -0,0 +1,14 @@
@PACKAGE_INIT@
include(CMakeFindDependencyMacro)
# Remember TBB option state
set(BLAKE3_USE_TBB @BLAKE3_USE_TBB@)
if(BLAKE3_USE_TBB)
find_dependency(TBB @TBB_VERSION@)
endif()
include("${CMAKE_CURRENT_LIST_DIR}/blake3-targets.cmake")
check_required_components(blake3)

external/blake3/blake3.c vendored Normal file

@@ -0,0 +1,650 @@
#include <assert.h>
#include <stdbool.h>
#include <string.h>
#include "blake3.h"
#include "blake3_impl.h"
const char *blake3_version(void) { return BLAKE3_VERSION_STRING; }
INLINE void chunk_state_init(blake3_chunk_state *self, const uint32_t key[8],
uint8_t flags) {
memcpy(self->cv, key, BLAKE3_KEY_LEN);
self->chunk_counter = 0;
memset(self->buf, 0, BLAKE3_BLOCK_LEN);
self->buf_len = 0;
self->blocks_compressed = 0;
self->flags = flags;
}
INLINE void chunk_state_reset(blake3_chunk_state *self, const uint32_t key[8],
uint64_t chunk_counter) {
memcpy(self->cv, key, BLAKE3_KEY_LEN);
self->chunk_counter = chunk_counter;
self->blocks_compressed = 0;
memset(self->buf, 0, BLAKE3_BLOCK_LEN);
self->buf_len = 0;
}
INLINE size_t chunk_state_len(const blake3_chunk_state *self) {
return (BLAKE3_BLOCK_LEN * (size_t)self->blocks_compressed) +
((size_t)self->buf_len);
}
INLINE size_t chunk_state_fill_buf(blake3_chunk_state *self,
const uint8_t *input, size_t input_len) {
size_t take = BLAKE3_BLOCK_LEN - ((size_t)self->buf_len);
if (take > input_len) {
take = input_len;
}
uint8_t *dest = self->buf + ((size_t)self->buf_len);
memcpy(dest, input, take);
self->buf_len += (uint8_t)take;
return take;
}
INLINE uint8_t chunk_state_maybe_start_flag(const blake3_chunk_state *self) {
if (self->blocks_compressed == 0) {
return CHUNK_START;
} else {
return 0;
}
}
typedef struct {
uint32_t input_cv[8];
uint64_t counter;
uint8_t block[BLAKE3_BLOCK_LEN];
uint8_t block_len;
uint8_t flags;
} output_t;
INLINE output_t make_output(const uint32_t input_cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags) {
output_t ret;
memcpy(ret.input_cv, input_cv, 32);
memcpy(ret.block, block, BLAKE3_BLOCK_LEN);
ret.block_len = block_len;
ret.counter = counter;
ret.flags = flags;
return ret;
}
// Chaining values within a given chunk (specifically the compress_in_place
// interface) are represented as words. This avoids unnecessary bytes<->words
// conversion overhead in the portable implementation. However, the hash_many
// interface handles both user input and parent node blocks, so it accepts
// bytes. For that reason, chaining values in the CV stack are represented as
// bytes.
INLINE void output_chaining_value(const output_t *self, uint8_t cv[32]) {
uint32_t cv_words[8];
memcpy(cv_words, self->input_cv, 32);
blake3_compress_in_place(cv_words, self->block, self->block_len,
self->counter, self->flags);
store_cv_words(cv, cv_words);
}
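// The root output is an unbounded stream produced in 64-byte blocks.
// output_root_bytes writes out_len bytes of that stream starting at byte
// `seek`: a partial leading block if seek is not 64-byte aligned, then whole
// blocks (via blake3_xof_many), then a partial trailing block if needed.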
INLINE void output_root_bytes(const output_t *self, uint64_t seek, uint8_t *out,
size_t out_len) {
if (out_len == 0) {
return;
}
uint64_t output_block_counter = seek / 64;
size_t offset_within_block = seek % 64;
uint8_t wide_buf[64];
if(offset_within_block) {
blake3_compress_xof(self->input_cv, self->block, self->block_len, output_block_counter, self->flags | ROOT, wide_buf);
const size_t available_bytes = 64 - offset_within_block;
const size_t bytes = out_len > available_bytes ? available_bytes : out_len;
memcpy(out, wide_buf + offset_within_block, bytes);
out += bytes;
out_len -= bytes;
output_block_counter += 1;
}
if(out_len / 64) {
blake3_xof_many(self->input_cv, self->block, self->block_len, output_block_counter, self->flags | ROOT, out, out_len / 64);
}
output_block_counter += out_len / 64;
out += out_len & -64;
out_len -= out_len & -64;
if(out_len) {
blake3_compress_xof(self->input_cv, self->block, self->block_len, output_block_counter, self->flags | ROOT, wide_buf);
memcpy(out, wide_buf, out_len);
}
}
INLINE void chunk_state_update(blake3_chunk_state *self, const uint8_t *input,
size_t input_len) {
if (self->buf_len > 0) {
size_t take = chunk_state_fill_buf(self, input, input_len);
input += take;
input_len -= take;
if (input_len > 0) {
blake3_compress_in_place(
self->cv, self->buf, BLAKE3_BLOCK_LEN, self->chunk_counter,
self->flags | chunk_state_maybe_start_flag(self));
self->blocks_compressed += 1;
self->buf_len = 0;
memset(self->buf, 0, BLAKE3_BLOCK_LEN);
}
}
while (input_len > BLAKE3_BLOCK_LEN) {
blake3_compress_in_place(self->cv, input, BLAKE3_BLOCK_LEN,
self->chunk_counter,
self->flags | chunk_state_maybe_start_flag(self));
self->blocks_compressed += 1;
input += BLAKE3_BLOCK_LEN;
input_len -= BLAKE3_BLOCK_LEN;
}
chunk_state_fill_buf(self, input, input_len);
}
INLINE output_t chunk_state_output(const blake3_chunk_state *self) {
uint8_t block_flags =
self->flags | chunk_state_maybe_start_flag(self) | CHUNK_END;
return make_output(self->cv, self->buf, self->buf_len, self->chunk_counter,
block_flags);
}
INLINE output_t parent_output(const uint8_t block[BLAKE3_BLOCK_LEN],
const uint32_t key[8], uint8_t flags) {
return make_output(key, block, BLAKE3_BLOCK_LEN, 0, flags | PARENT);
}
// Given some input larger than one chunk, return the number of bytes that
// should go in the left subtree. This is the largest power-of-2 number of
// chunks that leaves at least 1 byte for the right subtree.
INLINE size_t left_subtree_len(size_t input_len) {
// Subtract 1 to reserve at least one byte for the right side. input_len
// should always be greater than BLAKE3_CHUNK_LEN.
size_t full_chunks = (input_len - 1) / BLAKE3_CHUNK_LEN;
return round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN;
}
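// For example, with input_len = 4096 (4 chunks): full_chunks = (4096 - 1) /
// 1024 = 3, which rounds down to 2, so the left subtree gets 2048 bytes and
// the right subtree gets the remaining 2048.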
// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time
// on a single thread. Write out the chunk chaining values and return the
// number of chunks hashed. These chunks are never the root and never empty;
// those cases use a different codepath.
INLINE size_t compress_chunks_parallel(const uint8_t *input, size_t input_len,
const uint32_t key[8],
uint64_t chunk_counter, uint8_t flags,
uint8_t *out) {
#if defined(BLAKE3_TESTING)
assert(0 < input_len);
assert(input_len <= MAX_SIMD_DEGREE * BLAKE3_CHUNK_LEN);
#endif
const uint8_t *chunks_array[MAX_SIMD_DEGREE];
size_t input_position = 0;
size_t chunks_array_len = 0;
while (input_len - input_position >= BLAKE3_CHUNK_LEN) {
chunks_array[chunks_array_len] = &input[input_position];
input_position += BLAKE3_CHUNK_LEN;
chunks_array_len += 1;
}
blake3_hash_many(chunks_array, chunks_array_len,
BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN, key, chunk_counter,
true, flags, CHUNK_START, CHUNK_END, out);
// Hash the remaining partial chunk, if there is one. Note that the empty
// chunk (meaning the empty message) is a different codepath.
if (input_len > input_position) {
uint64_t counter = chunk_counter + (uint64_t)chunks_array_len;
blake3_chunk_state chunk_state;
chunk_state_init(&chunk_state, key, flags);
chunk_state.chunk_counter = counter;
chunk_state_update(&chunk_state, &input[input_position],
input_len - input_position);
output_t output = chunk_state_output(&chunk_state);
output_chaining_value(&output, &out[chunks_array_len * BLAKE3_OUT_LEN]);
return chunks_array_len + 1;
} else {
return chunks_array_len;
}
}
// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time
// on a single thread. Write out the parent chaining values and return the
// number of parents hashed. (If there's an odd input chaining value left over,
// return it as an additional output.) These parents are never the root and
// never empty; those cases use a different codepath.
INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,
size_t num_chaining_values,
const uint32_t key[8], uint8_t flags,
uint8_t *out) {
#if defined(BLAKE3_TESTING)
assert(2 <= num_chaining_values);
assert(num_chaining_values <= 2 * MAX_SIMD_DEGREE_OR_2);
#endif
const uint8_t *parents_array[MAX_SIMD_DEGREE_OR_2];
size_t parents_array_len = 0;
while (num_chaining_values - (2 * parents_array_len) >= 2) {
parents_array[parents_array_len] =
&child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN];
parents_array_len += 1;
}
blake3_hash_many(parents_array, parents_array_len, 1, key,
0, // Parents always use counter 0.
false, flags | PARENT,
0, // Parents have no start flags.
0, // Parents have no end flags.
out);
// If there's an odd child left over, it becomes an output.
if (num_chaining_values > 2 * parents_array_len) {
memcpy(&out[parents_array_len * BLAKE3_OUT_LEN],
&child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN],
BLAKE3_OUT_LEN);
return parents_array_len + 1;
} else {
return parents_array_len;
}
}
// The wide helper function returns (writes out) an array of chaining values
// and returns the length of that array. The number of chaining values returned
// is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer,
// if the input is shorter than that many chunks. The reason for maintaining a
// wide array of chaining values going back up the tree, is to allow the
// implementation to hash as many parents in parallel as possible.
//
// As a special case when the SIMD degree is 1, this function will still return
// at least 2 outputs. This guarantees that this function doesn't perform the
// root compression. (If it did, it would use the wrong flags, and also we
// wouldn't be able to implement extendable output.) Note that this function is
// not used when the whole input is only 1 chunk long; that's a different
// codepath.
//
// Why not just have the caller split the input on the first update(), instead
// of implementing this special rule? Because we don't want to limit SIMD or
// multi-threading parallelism for that update().
size_t blake3_compress_subtree_wide(const uint8_t *input, size_t input_len,
const uint32_t key[8],
uint64_t chunk_counter, uint8_t flags,
uint8_t *out, bool use_tbb) {
// Note that the single chunk case does *not* bump the SIMD degree up to 2
// when it is 1. If this implementation adds multi-threading in the future,
// this gives us the option of multi-threading even the 2-chunk case, which
// can help performance on smaller platforms.
if (input_len <= blake3_simd_degree() * BLAKE3_CHUNK_LEN) {
return compress_chunks_parallel(input, input_len, key, chunk_counter, flags,
out);
}
// With more than simd_degree chunks, we need to recurse. Start by dividing
// the input into left and right subtrees. (Note that this is only optimal
// as long as the SIMD degree is a power of 2. If we ever get a SIMD degree
// of 3 or something, we'll need a more complicated strategy.)
size_t left_input_len = left_subtree_len(input_len);
size_t right_input_len = input_len - left_input_len;
const uint8_t *right_input = &input[left_input_len];
uint64_t right_chunk_counter =
chunk_counter + (uint64_t)(left_input_len / BLAKE3_CHUNK_LEN);
// Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 to
// account for the special case of returning 2 outputs when the SIMD degree
// is 1.
uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
size_t degree = blake3_simd_degree();
if (left_input_len > BLAKE3_CHUNK_LEN && degree == 1) {
// The special case: We always use a degree of at least two, to make
// sure there are two outputs. Except, as noted above, at the chunk
// level, where we allow degree=1. (Note that the 1-chunk-input case is
// a different codepath.)
degree = 2;
}
uint8_t *right_cvs = &cv_array[degree * BLAKE3_OUT_LEN];
// Recurse!
size_t left_n = -1;
size_t right_n = -1;
#if defined(BLAKE3_USE_TBB)
blake3_compress_subtree_wide_join_tbb(
key, flags, use_tbb,
// left-hand side
input, left_input_len, chunk_counter, cv_array, &left_n,
// right-hand side
right_input, right_input_len, right_chunk_counter, right_cvs, &right_n);
#else
left_n = blake3_compress_subtree_wide(
input, left_input_len, key, chunk_counter, flags, cv_array, use_tbb);
right_n = blake3_compress_subtree_wide(right_input, right_input_len, key,
right_chunk_counter, flags, right_cvs,
use_tbb);
#endif // BLAKE3_USE_TBB
// The special case again. If simd_degree=1, then we'll have left_n=1 and
// right_n=1. Rather than compressing them into a single output, return
// them directly, to make sure we always have at least two outputs.
if (left_n == 1) {
memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
return 2;
}
// Otherwise, do one layer of parent node compression.
size_t num_chaining_values = left_n + right_n;
return compress_parents_parallel(cv_array, num_chaining_values, key, flags,
out);
}
// Hash a subtree with compress_subtree_wide(), and then condense the resulting
// list of chaining values down to a single parent node. Don't compress that
// last parent node, however. Instead, return its message bytes (the
// concatenated chaining values of its children). This is necessary when the
// first call to update() supplies a complete subtree, because the topmost
// parent node of that subtree could end up being the root. It's also necessary
// for extended output in the general case.
//
// As with compress_subtree_wide(), this function is not used on inputs of 1
// chunk or less. That's a different codepath.
INLINE void
compress_subtree_to_parent_node(const uint8_t *input, size_t input_len,
const uint32_t key[8], uint64_t chunk_counter,
uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN],
bool use_tbb) {
#if defined(BLAKE3_TESTING)
assert(input_len > BLAKE3_CHUNK_LEN);
#endif
uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key,
chunk_counter, flags, cv_array, use_tbb);
assert(num_cvs <= MAX_SIMD_DEGREE_OR_2);
// The following loop never executes when MAX_SIMD_DEGREE_OR_2 is 2, because
// as we just asserted, num_cvs will always be <=2 in that case. But GCC
// (particularly GCC 8.5) can't tell that it never executes, and if NDEBUG is
// set then it emits incorrect warnings here. We tried a few different
// hacks to silence these, but in the end our hacks just produced different
// warnings (see https://github.com/BLAKE3-team/BLAKE3/pull/380). Out of
// desperation, we ifdef out this entire loop when we know it's not needed.
#if MAX_SIMD_DEGREE_OR_2 > 2
// If MAX_SIMD_DEGREE_OR_2 is greater than 2 and there's enough input,
// compress_subtree_wide() returns more than 2 chaining values. Condense
// them into 2 by forming parent nodes repeatedly.
uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2];
while (num_cvs > 2) {
num_cvs =
compress_parents_parallel(cv_array, num_cvs, key, flags, out_array);
memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN);
}
#endif
memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
}
INLINE void hasher_init_base(blake3_hasher *self, const uint32_t key[8],
uint8_t flags) {
memcpy(self->key, key, BLAKE3_KEY_LEN);
chunk_state_init(&self->chunk, key, flags);
self->cv_stack_len = 0;
}
void blake3_hasher_init(blake3_hasher *self) { hasher_init_base(self, IV, 0); }
void blake3_hasher_init_keyed(blake3_hasher *self,
const uint8_t key[BLAKE3_KEY_LEN]) {
uint32_t key_words[8];
load_key_words(key, key_words);
hasher_init_base(self, key_words, KEYED_HASH);
}
void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
size_t context_len) {
blake3_hasher context_hasher;
hasher_init_base(&context_hasher, IV, DERIVE_KEY_CONTEXT);
blake3_hasher_update(&context_hasher, context, context_len);
uint8_t context_key[BLAKE3_KEY_LEN];
blake3_hasher_finalize(&context_hasher, context_key, BLAKE3_KEY_LEN);
uint32_t context_key_words[8];
load_key_words(context_key, context_key_words);
hasher_init_base(self, context_key_words, DERIVE_KEY_MATERIAL);
}
void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context) {
blake3_hasher_init_derive_key_raw(self, context, strlen(context));
}
// As described in hasher_push_cv() below, we do "lazy merging", delaying
// merges until right before the next CV is about to be added. This is
// different from the reference implementation. Another difference is that we
// aren't always merging 1 chunk at a time. Instead, each CV might represent
// any power-of-two number of chunks, as long as the smaller-above-larger stack
// order is maintained. Instead of the "count the trailing 0-bits" algorithm
// described in the spec, we use a "count the total number of 1-bits" variant
// that doesn't require us to retain the subtree size of the CV on top of the
// stack. The principle is the same: each CV that should remain in the stack is
// represented by a 1-bit in the total number of chunks (or bytes) so far.
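// For example, a total of 7 chunks (binary 111) has three 1-bits, so three
// CVs remain on the stack, for subtrees of 4, 2, and 1 chunks. A total of 8
// chunks (binary 1000) has one 1-bit, so the loop below merges the stack
// down to a single CV.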
INLINE void hasher_merge_cv_stack(blake3_hasher *self, uint64_t total_len) {
size_t post_merge_stack_len = (size_t)popcnt(total_len);
while (self->cv_stack_len > post_merge_stack_len) {
uint8_t *parent_node =
&self->cv_stack[(self->cv_stack_len - 2) * BLAKE3_OUT_LEN];
output_t output = parent_output(parent_node, self->key, self->chunk.flags);
output_chaining_value(&output, parent_node);
self->cv_stack_len -= 1;
}
}
// In reference_impl.rs, we merge the new CV with existing CVs from the stack
// before pushing it. We can do that because we know more input is coming, so
// we know none of the merges are root.
//
// This setting is different. We want to feed as much input as possible to
// compress_subtree_wide(), without setting aside anything for the chunk_state.
// If the user gives us 64 KiB, we want to parallelize over all 64 KiB at once
// as a single subtree, if at all possible.
//
// This leads to two problems:
// 1) This 64 KiB input might be the only call that ever gets made to update.
// In this case, the root node of the 64 KiB subtree would be the root node
// of the whole tree, and it would need to be ROOT finalized. We can't
// compress it until we know.
// 2) This 64 KiB input might complete a larger tree, whose root node is
// similarly going to be the root of the whole tree. For example, maybe
// we have 192 KiB (that is, 128 + 64) hashed so far. We can't compress the
// node at the root of the 256 KiB subtree until we know how to finalize it.
//
// The second problem is solved with "lazy merging". That is, when we're about
// to add a CV to the stack, we don't merge it with anything first, as the
// reference impl does. Instead we do merges using the *previous* CV that was
// added, which is sitting on top of the stack, and we put the new CV
// (unmerged) on top of the stack afterwards. This guarantees that we never
// merge the root node until finalize().
//
// Solving the first problem requires an additional tool,
// compress_subtree_to_parent_node(). That function always returns the top
// *two* chaining values of the subtree it's compressing. We then do lazy
// merging with each of them separately, so that the second CV will always
// remain unmerged. (That also helps us support extendable output when we're
// hashing an input all-at-once.)
INLINE void hasher_push_cv(blake3_hasher *self, uint8_t new_cv[BLAKE3_OUT_LEN],
uint64_t chunk_counter) {
hasher_merge_cv_stack(self, chunk_counter);
memcpy(&self->cv_stack[self->cv_stack_len * BLAKE3_OUT_LEN], new_cv,
BLAKE3_OUT_LEN);
self->cv_stack_len += 1;
}
INLINE void blake3_hasher_update_base(blake3_hasher *self, const void *input,
size_t input_len, bool use_tbb) {
// Explicitly checking for zero avoids causing UB by passing a null pointer
// to memcpy. This comes up in practice with things like:
// std::vector<uint8_t> v;
// blake3_hasher_update(&hasher, v.data(), v.size());
if (input_len == 0) {
return;
}
const uint8_t *input_bytes = (const uint8_t *)input;
// If we have some partial chunk bytes in the internal chunk_state, we need
// to finish that chunk first.
if (chunk_state_len(&self->chunk) > 0) {
size_t take = BLAKE3_CHUNK_LEN - chunk_state_len(&self->chunk);
if (take > input_len) {
take = input_len;
}
chunk_state_update(&self->chunk, input_bytes, take);
input_bytes += take;
input_len -= take;
// If we've filled the current chunk and there's more coming, finalize this
// chunk and proceed. In this case we know it's not the root.
if (input_len > 0) {
output_t output = chunk_state_output(&self->chunk);
uint8_t chunk_cv[32];
output_chaining_value(&output, chunk_cv);
hasher_push_cv(self, chunk_cv, self->chunk.chunk_counter);
chunk_state_reset(&self->chunk, self->key, self->chunk.chunk_counter + 1);
} else {
return;
}
}
// Now the chunk_state is clear, and we have more input. If there's more than
// a single chunk (so, definitely not the root chunk), hash the largest whole
// subtree we can, with the full benefits of SIMD (and maybe in the future,
// multi-threading) parallelism. Two restrictions:
// - The subtree has to be a power-of-2 number of chunks. Only subtrees along
// the right edge can be incomplete, and we don't know where the right edge
// is going to be until we get to finalize().
// - The subtree must evenly divide the total number of chunks up until this
// point (if total is not 0). If the current incomplete subtree is only
// waiting for 1 more chunk, we can't hash a subtree of 4 chunks. We have
// to complete the current subtree first.
// Because we might need to break up the input to form powers of 2, or to
// evenly divide what we already have, this part runs in a loop.
while (input_len > BLAKE3_CHUNK_LEN) {
size_t subtree_len = round_down_to_power_of_2(input_len);
uint64_t count_so_far = self->chunk.chunk_counter * BLAKE3_CHUNK_LEN;
// Shrink the subtree_len until it evenly divides the count so far. We know
// that subtree_len itself is a power of 2, so we can use a bitmasking
// trick instead of an actual remainder operation. (Note that if the caller
// consistently passes power-of-2 inputs of the same size, as is hopefully
// typical, this loop condition will always fail, and subtree_len will
// always be the full length of the input.)
//
// An aside: We don't have to shrink subtree_len quite this much. For
// example, if count_so_far is 1, we could pass 2 chunks to
// compress_subtree_to_parent_node. Since we'll get 2 CVs back, we'll still
// get the right answer in the end, and we might get to use 2-way SIMD
// parallelism. The problem with this optimization, is that it gets us
// stuck always hashing 2 chunks. The total number of chunks will remain
// odd, and we'll never graduate to higher degrees of parallelism. See
// https://github.com/BLAKE3-team/BLAKE3/issues/69.
while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0) {
subtree_len /= 2;
}
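// For example, with 3 chunks (3072 bytes) hashed so far and 8 KiB of new
// input, subtree_len shrinks 8192 -> 4096 -> 2048 -> 1024, because only a
// 1-chunk subtree evenly divides a 3-chunk total.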
// The shrunken subtree_len might now be 1 chunk long. If so, hash that one
// chunk by itself. Otherwise, compress the subtree into a pair of CVs.
uint64_t subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN;
if (subtree_len <= BLAKE3_CHUNK_LEN) {
blake3_chunk_state chunk_state;
chunk_state_init(&chunk_state, self->key, self->chunk.flags);
chunk_state.chunk_counter = self->chunk.chunk_counter;
chunk_state_update(&chunk_state, input_bytes, subtree_len);
output_t output = chunk_state_output(&chunk_state);
uint8_t cv[BLAKE3_OUT_LEN];
output_chaining_value(&output, cv);
hasher_push_cv(self, cv, chunk_state.chunk_counter);
} else {
// This is the high-performance happy path, though getting here depends
// on the caller giving us a long enough input.
uint8_t cv_pair[2 * BLAKE3_OUT_LEN];
compress_subtree_to_parent_node(input_bytes, subtree_len, self->key,
self->chunk.chunk_counter,
self->chunk.flags, cv_pair, use_tbb);
hasher_push_cv(self, cv_pair, self->chunk.chunk_counter);
hasher_push_cv(self, &cv_pair[BLAKE3_OUT_LEN],
self->chunk.chunk_counter + (subtree_chunks / 2));
}
self->chunk.chunk_counter += subtree_chunks;
input_bytes += subtree_len;
input_len -= subtree_len;
}
// If there's any remaining input less than a full chunk, add it to the chunk
// state. In that case, also do a final merge loop to make sure the subtree
// stack doesn't contain any unmerged pairs. The remaining input means we
// know these merges are non-root. This merge loop isn't strictly necessary
// here, because hasher_push_cv already does its own merge loop, but it
// simplifies blake3_hasher_finalize below.
if (input_len > 0) {
chunk_state_update(&self->chunk, input_bytes, input_len);
hasher_merge_cv_stack(self, self->chunk.chunk_counter);
}
}
void blake3_hasher_update(blake3_hasher *self, const void *input,
size_t input_len) {
bool use_tbb = false;
blake3_hasher_update_base(self, input, input_len, use_tbb);
}
#if defined(BLAKE3_USE_TBB)
void blake3_hasher_update_tbb(blake3_hasher *self, const void *input,
size_t input_len) {
bool use_tbb = true;
blake3_hasher_update_base(self, input, input_len, use_tbb);
}
#endif // BLAKE3_USE_TBB
void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
size_t out_len) {
blake3_hasher_finalize_seek(self, 0, out, out_len);
}
void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
uint8_t *out, size_t out_len) {
// Explicitly checking for zero avoids causing UB by passing a null pointer
// to memcpy. This comes up in practice with things like:
// std::vector<uint8_t> v;
// blake3_hasher_finalize(&hasher, v.data(), v.size());
if (out_len == 0) {
return;
}
// If the subtree stack is empty, then the current chunk is the root.
if (self->cv_stack_len == 0) {
output_t output = chunk_state_output(&self->chunk);
output_root_bytes(&output, seek, out, out_len);
return;
}
// If there are any bytes in the chunk state, finalize that chunk and do a
// roll-up merge between that chunk hash and every subtree in the stack. In
// this case, the extra merge loop at the end of blake3_hasher_update
// guarantees that none of the subtrees in the stack need to be merged with
// each other first. Otherwise, if there are no bytes in the chunk state,
// then the top of the stack is a chunk hash, and we start the merge from
// that.
output_t output;
size_t cvs_remaining;
if (chunk_state_len(&self->chunk) > 0) {
cvs_remaining = self->cv_stack_len;
output = chunk_state_output(&self->chunk);
} else {
// There are always at least 2 CVs in the stack in this case.
cvs_remaining = self->cv_stack_len - 2;
output = parent_output(&self->cv_stack[cvs_remaining * 32], self->key,
self->chunk.flags);
}
while (cvs_remaining > 0) {
cvs_remaining -= 1;
uint8_t parent_block[BLAKE3_BLOCK_LEN];
memcpy(parent_block, &self->cv_stack[cvs_remaining * 32], 32);
output_chaining_value(&output, &parent_block[32]);
output = parent_output(parent_block, self->key, self->chunk.flags);
}
output_root_bytes(&output, seek, out, out_len);
}
void blake3_hasher_reset(blake3_hasher *self) {
chunk_state_reset(&self->chunk, self->key, 0);
self->cv_stack_len = 0;
}

external/blake3/blake3.h vendored Normal file

@@ -0,0 +1,86 @@
#ifndef BLAKE3_H
#define BLAKE3_H
#include <stddef.h>
#include <stdint.h>
#if !defined(BLAKE3_API)
# if defined(_WIN32) || defined(__CYGWIN__)
# if defined(BLAKE3_DLL)
# if defined(BLAKE3_DLL_EXPORTS)
# define BLAKE3_API __declspec(dllexport)
# else
# define BLAKE3_API __declspec(dllimport)
# endif
# define BLAKE3_PRIVATE
# else
# define BLAKE3_API
# define BLAKE3_PRIVATE
# endif
# elif __GNUC__ >= 4
# define BLAKE3_API __attribute__((visibility("default")))
# define BLAKE3_PRIVATE __attribute__((visibility("hidden")))
# else
# define BLAKE3_API
# define BLAKE3_PRIVATE
# endif
#endif
#ifdef __cplusplus
extern "C" {
#endif
#define BLAKE3_VERSION_STRING "1.8.2"
#define BLAKE3_KEY_LEN 32
#define BLAKE3_OUT_LEN 32
#define BLAKE3_BLOCK_LEN 64
#define BLAKE3_CHUNK_LEN 1024
#define BLAKE3_MAX_DEPTH 54
// This struct is a private implementation detail. It has to be here because
// it's part of blake3_hasher below.
typedef struct {
uint32_t cv[8];
uint64_t chunk_counter;
uint8_t buf[BLAKE3_BLOCK_LEN];
uint8_t buf_len;
uint8_t blocks_compressed;
uint8_t flags;
} blake3_chunk_state;
typedef struct {
uint32_t key[8];
blake3_chunk_state chunk;
uint8_t cv_stack_len;
// The stack size is MAX_DEPTH + 1 because we do lazy merging. For example,
// with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk
// requires a 4th entry, rather than merging everything down to 1, because we
// don't know whether more input is coming. This is different from how the
// reference implementation does things.
uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN];
} blake3_hasher;
BLAKE3_API const char *blake3_version(void);
BLAKE3_API void blake3_hasher_init(blake3_hasher *self);
BLAKE3_API void blake3_hasher_init_keyed(blake3_hasher *self,
const uint8_t key[BLAKE3_KEY_LEN]);
BLAKE3_API void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context);
BLAKE3_API void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
size_t context_len);
BLAKE3_API void blake3_hasher_update(blake3_hasher *self, const void *input,
size_t input_len);
#if defined(BLAKE3_USE_TBB)
BLAKE3_API void blake3_hasher_update_tbb(blake3_hasher *self, const void *input,
size_t input_len);
#endif // BLAKE3_USE_TBB
BLAKE3_API void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
size_t out_len);
BLAKE3_API void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
uint8_t *out, size_t out_len);
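// Note: blake3_hasher_finalize(self, out, out_len) is equivalent to
// blake3_hasher_finalize_seek(self, 0, out, out_len); seek is the byte
// offset into the extended output stream at which to start writing.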
BLAKE3_API void blake3_hasher_reset(blake3_hasher *self);
#ifdef __cplusplus
}
#endif
#endif /* BLAKE3_H */

external/blake3/blake3_avx2.c vendored Normal file

@@ -0,0 +1,326 @@
#include "blake3_impl.h"
#include <immintrin.h>
#define DEGREE 8
INLINE __m256i loadu(const uint8_t src[32]) {
return _mm256_loadu_si256((const __m256i *)src);
}
INLINE void storeu(__m256i src, uint8_t dest[32]) {
_mm256_storeu_si256((__m256i *)dest, src);
}
INLINE __m256i addv(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); }
// Note that clang-format doesn't like the name "xor" for some reason.
INLINE __m256i xorv(__m256i a, __m256i b) { return _mm256_xor_si256(a, b); }
INLINE __m256i set1(uint32_t x) { return _mm256_set1_epi32((int32_t)x); }
INLINE __m256i rot16(__m256i x) {
return _mm256_shuffle_epi8(
x, _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2,
13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2));
}
INLINE __m256i rot12(__m256i x) {
return _mm256_or_si256(_mm256_srli_epi32(x, 12), _mm256_slli_epi32(x, 32 - 12));
}
INLINE __m256i rot8(__m256i x) {
return _mm256_shuffle_epi8(
x, _mm256_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1,
12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1));
}
INLINE __m256i rot7(__m256i x) {
return _mm256_or_si256(_mm256_srli_epi32(x, 7), _mm256_slli_epi32(x, 32 - 7));
}
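// The 16-bit and 8-bit rotations are byte-aligned, so rot16 and rot8 can be
// done with a single byte shuffle; rot12 and rot7 are not byte-aligned and
// fall back to a shift/shift/or sequence.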
INLINE void round_fn(__m256i v[16], __m256i m[16], size_t r) {
v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
v[0] = addv(v[0], v[4]);
v[1] = addv(v[1], v[5]);
v[2] = addv(v[2], v[6]);
v[3] = addv(v[3], v[7]);
v[12] = xorv(v[12], v[0]);
v[13] = xorv(v[13], v[1]);
v[14] = xorv(v[14], v[2]);
v[15] = xorv(v[15], v[3]);
v[12] = rot16(v[12]);
v[13] = rot16(v[13]);
v[14] = rot16(v[14]);
v[15] = rot16(v[15]);
v[8] = addv(v[8], v[12]);
v[9] = addv(v[9], v[13]);
v[10] = addv(v[10], v[14]);
v[11] = addv(v[11], v[15]);
v[4] = xorv(v[4], v[8]);
v[5] = xorv(v[5], v[9]);
v[6] = xorv(v[6], v[10]);
v[7] = xorv(v[7], v[11]);
v[4] = rot12(v[4]);
v[5] = rot12(v[5]);
v[6] = rot12(v[6]);
v[7] = rot12(v[7]);
v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
v[0] = addv(v[0], v[4]);
v[1] = addv(v[1], v[5]);
v[2] = addv(v[2], v[6]);
v[3] = addv(v[3], v[7]);
v[12] = xorv(v[12], v[0]);
v[13] = xorv(v[13], v[1]);
v[14] = xorv(v[14], v[2]);
v[15] = xorv(v[15], v[3]);
v[12] = rot8(v[12]);
v[13] = rot8(v[13]);
v[14] = rot8(v[14]);
v[15] = rot8(v[15]);
v[8] = addv(v[8], v[12]);
v[9] = addv(v[9], v[13]);
v[10] = addv(v[10], v[14]);
v[11] = addv(v[11], v[15]);
v[4] = xorv(v[4], v[8]);
v[5] = xorv(v[5], v[9]);
v[6] = xorv(v[6], v[10]);
v[7] = xorv(v[7], v[11]);
v[4] = rot7(v[4]);
v[5] = rot7(v[5]);
v[6] = rot7(v[6]);
v[7] = rot7(v[7]);
v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
v[0] = addv(v[0], v[5]);
v[1] = addv(v[1], v[6]);
v[2] = addv(v[2], v[7]);
v[3] = addv(v[3], v[4]);
v[15] = xorv(v[15], v[0]);
v[12] = xorv(v[12], v[1]);
v[13] = xorv(v[13], v[2]);
v[14] = xorv(v[14], v[3]);
v[15] = rot16(v[15]);
v[12] = rot16(v[12]);
v[13] = rot16(v[13]);
v[14] = rot16(v[14]);
v[10] = addv(v[10], v[15]);
v[11] = addv(v[11], v[12]);
v[8] = addv(v[8], v[13]);
v[9] = addv(v[9], v[14]);
v[5] = xorv(v[5], v[10]);
v[6] = xorv(v[6], v[11]);
v[7] = xorv(v[7], v[8]);
v[4] = xorv(v[4], v[9]);
v[5] = rot12(v[5]);
v[6] = rot12(v[6]);
v[7] = rot12(v[7]);
v[4] = rot12(v[4]);
v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
v[0] = addv(v[0], v[5]);
v[1] = addv(v[1], v[6]);
v[2] = addv(v[2], v[7]);
v[3] = addv(v[3], v[4]);
v[15] = xorv(v[15], v[0]);
v[12] = xorv(v[12], v[1]);
v[13] = xorv(v[13], v[2]);
v[14] = xorv(v[14], v[3]);
v[15] = rot8(v[15]);
v[12] = rot8(v[12]);
v[13] = rot8(v[13]);
v[14] = rot8(v[14]);
v[10] = addv(v[10], v[15]);
v[11] = addv(v[11], v[12]);
v[8] = addv(v[8], v[13]);
v[9] = addv(v[9], v[14]);
v[5] = xorv(v[5], v[10]);
v[6] = xorv(v[6], v[11]);
v[7] = xorv(v[7], v[8]);
v[4] = xorv(v[4], v[9]);
v[5] = rot7(v[5]);
v[6] = rot7(v[6]);
v[7] = rot7(v[7]);
v[4] = rot7(v[4]);
}
INLINE void transpose_vecs(__m256i vecs[DEGREE]) {
// Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high
// is 22/33/66/77.
__m256i ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]);
__m256i ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]);
__m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]);
__m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]);
__m256i ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]);
__m256i ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]);
__m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
__m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]);
// Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is
// 11/33.
__m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145);
__m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145);
__m256i abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367);
__m256i abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367);
__m256i efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145);
__m256i efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145);
__m256i efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367);
__m256i efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367);
// Interleave 128-bit lanes.
vecs[0] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x20);
vecs[1] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x20);
vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20);
vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20);
vecs[4] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x31);
vecs[5] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x31);
vecs[6] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x31);
vecs[7] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x31);
}
INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
size_t block_offset, __m256i out[16]) {
out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m256i)]);
out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m256i)]);
out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m256i)]);
out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m256i)]);
out[4] = loadu(&inputs[4][block_offset + 0 * sizeof(__m256i)]);
out[5] = loadu(&inputs[5][block_offset + 0 * sizeof(__m256i)]);
out[6] = loadu(&inputs[6][block_offset + 0 * sizeof(__m256i)]);
out[7] = loadu(&inputs[7][block_offset + 0 * sizeof(__m256i)]);
out[8] = loadu(&inputs[0][block_offset + 1 * sizeof(__m256i)]);
out[9] = loadu(&inputs[1][block_offset + 1 * sizeof(__m256i)]);
out[10] = loadu(&inputs[2][block_offset + 1 * sizeof(__m256i)]);
out[11] = loadu(&inputs[3][block_offset + 1 * sizeof(__m256i)]);
out[12] = loadu(&inputs[4][block_offset + 1 * sizeof(__m256i)]);
out[13] = loadu(&inputs[5][block_offset + 1 * sizeof(__m256i)]);
out[14] = loadu(&inputs[6][block_offset + 1 * sizeof(__m256i)]);
out[15] = loadu(&inputs[7][block_offset + 1 * sizeof(__m256i)]);
for (size_t i = 0; i < 8; ++i) {
_mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
}
transpose_vecs(&out[0]);
transpose_vecs(&out[8]);
}
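// Broadcast the 64-bit chunk counter into per-lane values counter+0..7 (when
// increment_counter is set), split into 32-bit halves. Unsigned overflow of
// the low half is detected with a signed comparison after biasing both sides
// by 0x80000000; the resulting all-ones carry mask is subtracted from the
// high half, which adds 1 where the low half wrapped.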
INLINE void load_counters(uint64_t counter, bool increment_counter,
__m256i *out_lo, __m256i *out_hi) {
const __m256i mask = _mm256_set1_epi32(-(int32_t)increment_counter);
const __m256i add0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
const __m256i add1 = _mm256_and_si256(mask, add0);
__m256i l = _mm256_add_epi32(_mm256_set1_epi32((int32_t)counter), add1);
__m256i carry = _mm256_cmpgt_epi32(_mm256_xor_si256(add1, _mm256_set1_epi32(0x80000000)),
_mm256_xor_si256( l, _mm256_set1_epi32(0x80000000)));
__m256i h = _mm256_sub_epi32(_mm256_set1_epi32((int32_t)(counter >> 32)), carry);
*out_lo = l;
*out_hi = h;
}
static
void blake3_hash8_avx2(const uint8_t *const *inputs, size_t blocks,
const uint32_t key[8], uint64_t counter,
bool increment_counter, uint8_t flags,
uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
__m256i h_vecs[8] = {
set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]),
set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]),
};
__m256i counter_low_vec, counter_high_vec;
load_counters(counter, increment_counter, &counter_low_vec,
&counter_high_vec);
uint8_t block_flags = flags | flags_start;
for (size_t block = 0; block < blocks; block++) {
if (block + 1 == blocks) {
block_flags |= flags_end;
}
__m256i block_len_vec = set1(BLAKE3_BLOCK_LEN);
__m256i block_flags_vec = set1(block_flags);
__m256i msg_vecs[16];
transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
__m256i v[16] = {
h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]),
counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
};
round_fn(v, msg_vecs, 0);
round_fn(v, msg_vecs, 1);
round_fn(v, msg_vecs, 2);
round_fn(v, msg_vecs, 3);
round_fn(v, msg_vecs, 4);
round_fn(v, msg_vecs, 5);
round_fn(v, msg_vecs, 6);
h_vecs[0] = xorv(v[0], v[8]);
h_vecs[1] = xorv(v[1], v[9]);
h_vecs[2] = xorv(v[2], v[10]);
h_vecs[3] = xorv(v[3], v[11]);
h_vecs[4] = xorv(v[4], v[12]);
h_vecs[5] = xorv(v[5], v[13]);
h_vecs[6] = xorv(v[6], v[14]);
h_vecs[7] = xorv(v[7], v[15]);
block_flags = flags;
}
transpose_vecs(h_vecs);
storeu(h_vecs[0], &out[0 * sizeof(__m256i)]);
storeu(h_vecs[1], &out[1 * sizeof(__m256i)]);
storeu(h_vecs[2], &out[2 * sizeof(__m256i)]);
storeu(h_vecs[3], &out[3 * sizeof(__m256i)]);
storeu(h_vecs[4], &out[4 * sizeof(__m256i)]);
storeu(h_vecs[5], &out[5 * sizeof(__m256i)]);
storeu(h_vecs[6], &out[6 * sizeof(__m256i)]);
storeu(h_vecs[7], &out[7 * sizeof(__m256i)]);
}
#if !defined(BLAKE3_NO_SSE41)
void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
uint64_t counter, bool increment_counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out);
#else
void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
uint64_t counter, bool increment_counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out);
#endif
void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
uint64_t counter, bool increment_counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out) {
while (num_inputs >= DEGREE) {
blake3_hash8_avx2(inputs, blocks, key, counter, increment_counter, flags,
flags_start, flags_end, out);
if (increment_counter) {
counter += DEGREE;
}
inputs += DEGREE;
num_inputs -= DEGREE;
out = &out[DEGREE * BLAKE3_OUT_LEN];
}
#if !defined(BLAKE3_NO_SSE41)
blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
increment_counter, flags, flags_start, flags_end, out);
#else
blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
increment_counter, flags, flags_start, flags_end,
out);
#endif
}

external/blake3/blake3_avx2_x86-64_unix.S vendored Normal file
File diff suppressed because it is too large

external/blake3/blake3_avx512.c vendored Normal file
File diff suppressed because it is too large

Several additional large vendored file diffs are suppressed here.

@@ -0,0 +1,32 @@
# These are Rust bindings for the C implementation of BLAKE3. As there is a
# native (and faster) Rust implementation of BLAKE3 provided in this same repo,
# these bindings are not expected to be used in production. They're intended
# for testing and benchmarking.
[package]
name = "blake3_c_rust_bindings"
version = "0.0.0"
description = "TESTING ONLY Rust bindings for the BLAKE3 C implementation"
edition = "2021"
[features]
# By default the x86-64 build uses assembly implementations. This feature makes
# the build use the C intrinsics implementations instead.
prefer_intrinsics = []
# Activate NEON bindings. We don't currently do any CPU feature detection for
# this. If this Cargo feature is on, the NEON gets used.
neon = []
# Enable TBB-based multithreading.
tbb = []
[dev-dependencies]
arrayref = "0.3.5"
arrayvec = { version = "0.7.0", default-features = false }
page_size = "0.6.0"
rand = "0.9.0"
rand_chacha = "0.9.0"
reference_impl = { path = "../../reference_impl" }
[build-dependencies]
cc = "1.0.48"
ignore = "0.4.23"


@@ -0,0 +1,4 @@
These are Rust bindings for the C implementation of BLAKE3. As there is
a native Rust implementation of BLAKE3 provided in this same repo, these
bindings are not expected to be used in production. They're intended for
testing and benchmarking.


@@ -0,0 +1,477 @@
#![feature(test)]
extern crate test;
use arrayref::array_ref;
use arrayvec::ArrayVec;
use rand::prelude::*;
use test::Bencher;
const KIB: usize = 1024;
const MAX_SIMD_DEGREE: usize = 16;
const BLOCK_LEN: usize = 64;
const CHUNK_LEN: usize = 1024;
const OUT_LEN: usize = 32;
// This struct randomizes two things:
// 1. The actual bytes of input.
// 2. The page offset the input starts at.
pub struct RandomInput {
buf: Vec<u8>,
len: usize,
offsets: Vec<usize>,
offset_index: usize,
}
impl RandomInput {
pub fn new(b: &mut Bencher, len: usize) -> Self {
b.bytes += len as u64;
let page_size: usize = page_size::get();
let mut buf = vec![0u8; len + page_size];
let mut rng = rand::rng();
rng.fill_bytes(&mut buf);
let mut offsets: Vec<usize> = (0..page_size).collect();
offsets.shuffle(&mut rng);
Self {
buf,
len,
offsets,
offset_index: 0,
}
}
pub fn get(&mut self) -> &[u8] {
let offset = self.offsets[self.offset_index];
self.offset_index += 1;
if self.offset_index >= self.offsets.len() {
self.offset_index = 0;
}
&self.buf[offset..][..self.len]
}
}
type CompressInPlaceFn =
unsafe extern "C" fn(cv: *mut u32, block: *const u8, block_len: u8, counter: u64, flags: u8);
fn bench_single_compression_fn(b: &mut Bencher, f: CompressInPlaceFn) {
let mut state = [1u32; 8];
let mut r = RandomInput::new(b, 64);
let input = array_ref!(r.get(), 0, 64);
b.iter(|| unsafe { f(state.as_mut_ptr(), input.as_ptr(), 64, 0, 0) });
}
#[bench]
fn bench_single_compression_portable(b: &mut Bencher) {
bench_single_compression_fn(
b,
blake3_c_rust_bindings::ffi::blake3_compress_in_place_portable,
);
}
#[bench]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn bench_single_compression_sse2(b: &mut Bencher) {
if !blake3_c_rust_bindings::sse2_detected() {
return;
}
bench_single_compression_fn(
b,
blake3_c_rust_bindings::ffi::x86::blake3_compress_in_place_sse2,
);
}
#[bench]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn bench_single_compression_sse41(b: &mut Bencher) {
if !blake3_c_rust_bindings::sse41_detected() {
return;
}
bench_single_compression_fn(
b,
blake3_c_rust_bindings::ffi::x86::blake3_compress_in_place_sse41,
);
}
#[bench]
fn bench_single_compression_avx512(b: &mut Bencher) {
if !blake3_c_rust_bindings::avx512_detected() {
return;
}
bench_single_compression_fn(
b,
blake3_c_rust_bindings::ffi::x86::blake3_compress_in_place_avx512,
);
}
type HashManyFn = unsafe extern "C" fn(
inputs: *const *const u8,
num_inputs: usize,
blocks: usize,
key: *const u32,
counter: u64,
increment_counter: bool,
flags: u8,
flags_start: u8,
flags_end: u8,
out: *mut u8,
);
fn bench_many_chunks_fn(b: &mut Bencher, f: HashManyFn, degree: usize) {
let mut inputs = Vec::new();
for _ in 0..degree {
inputs.push(RandomInput::new(b, CHUNK_LEN));
}
b.iter(|| {
let input_arrays: ArrayVec<&[u8; CHUNK_LEN], MAX_SIMD_DEGREE> = inputs
.iter_mut()
.take(degree)
.map(|i| array_ref!(i.get(), 0, CHUNK_LEN))
.collect();
let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN];
unsafe {
f(
input_arrays.as_ptr() as _,
input_arrays.len(),
CHUNK_LEN / BLOCK_LEN,
[0u32; 8].as_ptr(),
0,
true,
0,
0,
0,
out.as_mut_ptr(),
)
}
});
}
#[bench]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn bench_many_chunks_sse2(b: &mut Bencher) {
if !blake3_c_rust_bindings::sse2_detected() {
return;
}
bench_many_chunks_fn(
b,
blake3_c_rust_bindings::ffi::x86::blake3_hash_many_sse2,
4,
);
}
#[bench]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn bench_many_chunks_sse41(b: &mut Bencher) {
if !blake3_c_rust_bindings::sse41_detected() {
return;
}
bench_many_chunks_fn(
b,
blake3_c_rust_bindings::ffi::x86::blake3_hash_many_sse41,
4,
);
}
#[bench]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn bench_many_chunks_avx2(b: &mut Bencher) {
if !blake3_c_rust_bindings::avx2_detected() {
return;
}
bench_many_chunks_fn(
b,
blake3_c_rust_bindings::ffi::x86::blake3_hash_many_avx2,
8,
);
}
#[bench]
fn bench_many_chunks_avx512(b: &mut Bencher) {
if !blake3_c_rust_bindings::avx512_detected() {
return;
}
bench_many_chunks_fn(
b,
blake3_c_rust_bindings::ffi::x86::blake3_hash_many_avx512,
16,
);
}
#[bench]
#[cfg(feature = "neon")]
fn bench_many_chunks_neon(b: &mut Bencher) {
// When "neon" is on, NEON support is assumed.
bench_many_chunks_fn(
b,
blake3_c_rust_bindings::ffi::neon::blake3_hash_many_neon,
4,
);
}
// TODO: When we get const generics we can unify this with the chunks code.
fn bench_many_parents_fn(b: &mut Bencher, f: HashManyFn, degree: usize) {
let mut inputs = Vec::new();
for _ in 0..degree {
inputs.push(RandomInput::new(b, BLOCK_LEN));
}
b.iter(|| {
let input_arrays: ArrayVec<&[u8; BLOCK_LEN], MAX_SIMD_DEGREE> = inputs
.iter_mut()
.take(degree)
.map(|i| array_ref!(i.get(), 0, BLOCK_LEN))
.collect();
let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN];
unsafe {
f(
input_arrays.as_ptr() as _,
input_arrays.len(),
1,
[0u32; 8].as_ptr(),
0,
false,
0,
0,
0,
out.as_mut_ptr(),
)
}
});
}
#[bench]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn bench_many_parents_sse2(b: &mut Bencher) {
if !blake3_c_rust_bindings::sse2_detected() {
return;
}
bench_many_parents_fn(
b,
blake3_c_rust_bindings::ffi::x86::blake3_hash_many_sse2,
4,
);
}
#[bench]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn bench_many_parents_sse41(b: &mut Bencher) {
if !blake3_c_rust_bindings::sse41_detected() {
return;
}
bench_many_parents_fn(
b,
blake3_c_rust_bindings::ffi::x86::blake3_hash_many_sse41,
4,
);
}
#[bench]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn bench_many_parents_avx2(b: &mut Bencher) {
if !blake3_c_rust_bindings::avx2_detected() {
return;
}
bench_many_parents_fn(
b,
blake3_c_rust_bindings::ffi::x86::blake3_hash_many_avx2,
8,
);
}
#[bench]
fn bench_many_parents_avx512(b: &mut Bencher) {
if !blake3_c_rust_bindings::avx512_detected() {
return;
}
bench_many_parents_fn(
b,
blake3_c_rust_bindings::ffi::x86::blake3_hash_many_avx512,
16,
);
}
#[bench]
#[cfg(feature = "neon")]
fn bench_many_parents_neon(b: &mut Bencher) {
// When "neon" is on, NEON support is assumed.
bench_many_parents_fn(
b,
blake3_c_rust_bindings::ffi::neon::blake3_hash_many_neon,
4,
);
}
fn bench_incremental(b: &mut Bencher, len: usize) {
let mut input = RandomInput::new(b, len);
b.iter(|| {
let mut hasher = blake3_c_rust_bindings::Hasher::new();
hasher.update(input.get());
let mut out = [0; 32];
hasher.finalize(&mut out);
out
});
}
#[bench]
fn bench_incremental_0001_block(b: &mut Bencher) {
bench_incremental(b, BLOCK_LEN);
}
#[bench]
fn bench_incremental_0001_kib(b: &mut Bencher) {
bench_incremental(b, 1 * KIB);
}
#[bench]
fn bench_incremental_0002_kib(b: &mut Bencher) {
bench_incremental(b, 2 * KIB);
}
#[bench]
fn bench_incremental_0004_kib(b: &mut Bencher) {
bench_incremental(b, 4 * KIB);
}
#[bench]
fn bench_incremental_0008_kib(b: &mut Bencher) {
bench_incremental(b, 8 * KIB);
}
#[bench]
fn bench_incremental_0016_kib(b: &mut Bencher) {
bench_incremental(b, 16 * KIB);
}
#[bench]
fn bench_incremental_0032_kib(b: &mut Bencher) {
bench_incremental(b, 32 * KIB);
}
#[bench]
fn bench_incremental_0064_kib(b: &mut Bencher) {
bench_incremental(b, 64 * KIB);
}
#[bench]
fn bench_incremental_0128_kib(b: &mut Bencher) {
bench_incremental(b, 128 * KIB);
}
#[bench]
fn bench_incremental_0256_kib(b: &mut Bencher) {
bench_incremental(b, 256 * KIB);
}
#[bench]
fn bench_incremental_0512_kib(b: &mut Bencher) {
bench_incremental(b, 512 * KIB);
}
#[bench]
fn bench_incremental_1024_kib(b: &mut Bencher) {
bench_incremental(b, 1024 * KIB);
}
#[cfg(feature = "tbb")]
fn bench_tbb(b: &mut Bencher, len: usize) {
let mut input = RandomInput::new(b, len);
b.iter(|| {
let mut hasher = blake3_c_rust_bindings::Hasher::new();
hasher.update_tbb(input.get());
let mut out = [0; 32];
hasher.finalize(&mut out);
out
});
}
#[bench]
#[cfg(feature = "tbb")]
fn bench_tbb_0001_block(b: &mut Bencher) {
bench_tbb(b, BLOCK_LEN);
}
#[bench]
#[cfg(feature = "tbb")]
fn bench_tbb_0001_kib(b: &mut Bencher) {
bench_tbb(b, 1 * KIB);
}
#[bench]
#[cfg(feature = "tbb")]
fn bench_tbb_0002_kib(b: &mut Bencher) {
bench_tbb(b, 2 * KIB);
}
#[bench]
#[cfg(feature = "tbb")]
fn bench_tbb_0004_kib(b: &mut Bencher) {
bench_tbb(b, 4 * KIB);
}
#[bench]
#[cfg(feature = "tbb")]
fn bench_tbb_0008_kib(b: &mut Bencher) {
bench_tbb(b, 8 * KIB);
}
#[bench]
#[cfg(feature = "tbb")]
fn bench_tbb_0016_kib(b: &mut Bencher) {
bench_tbb(b, 16 * KIB);
}
#[bench]
#[cfg(feature = "tbb")]
fn bench_tbb_0032_kib(b: &mut Bencher) {
bench_tbb(b, 32 * KIB);
}
#[bench]
#[cfg(feature = "tbb")]
fn bench_tbb_0064_kib(b: &mut Bencher) {
bench_tbb(b, 64 * KIB);
}
#[bench]
#[cfg(feature = "tbb")]
fn bench_tbb_0128_kib(b: &mut Bencher) {
bench_tbb(b, 128 * KIB);
}
#[bench]
#[cfg(feature = "tbb")]
fn bench_tbb_0256_kib(b: &mut Bencher) {
bench_tbb(b, 256 * KIB);
}
#[bench]
#[cfg(feature = "tbb")]
fn bench_tbb_0512_kib(b: &mut Bencher) {
bench_tbb(b, 512 * KIB);
}
#[bench]
#[cfg(feature = "tbb")]
fn bench_tbb_1024_kib(b: &mut Bencher) {
bench_tbb(b, 1024 * KIB);
}
// This checks that update() splits up its input in increasing powers of 2, so
// that it can recover a high degree of parallelism when the number of bytes
// hashed so far is uneven. The performance of this benchmark should be
// reasonably close to bench_incremental_0064_kib, within 80% or so. When we
// had a bug in this logic (https://github.com/BLAKE3-team/BLAKE3/issues/69),
// performance was less than half.
#[bench]
fn bench_two_updates(b: &mut Bencher) {
let len = 65536;
let mut input = RandomInput::new(b, len);
b.iter(|| {
let mut hasher = blake3_c_rust_bindings::Hasher::new();
let input = input.get();
hasher.update(&input[..1]);
hasher.update(&input[1..]);
let mut out = [0; 32];
hasher.finalize(&mut out);
out
});
}


@@ -0,0 +1,253 @@
use std::env;
fn defined(var: &str) -> bool {
env::var_os(var).is_some()
}
fn target_components() -> Vec<String> {
let target = env::var("TARGET").unwrap();
target.split("-").map(|s| s.to_string()).collect()
}
fn is_x86_64() -> bool {
target_components()[0] == "x86_64"
}
fn is_windows_target() -> bool {
env::var("CARGO_CFG_TARGET_OS").unwrap() == "windows"
}
fn use_msvc_asm() -> bool {
const MSVC_NAMES: &[&str] = &["", "cl", "cl.exe"];
let target_os = env::var("CARGO_CFG_TARGET_OS").unwrap_or_default();
let target_env = env::var("CARGO_CFG_TARGET_ENV").unwrap_or_default();
let target_windows_msvc = target_os == "windows" && target_env == "msvc";
let host_triple = env::var("HOST").unwrap_or_default();
let target_triple = env::var("TARGET").unwrap_or_default();
let cross_compiling = host_triple != target_triple;
let cc = env::var("CC").unwrap_or_default().to_ascii_lowercase();
if !target_windows_msvc {
// We are not building for Windows with the MSVC toolchain.
false
} else if !cross_compiling && MSVC_NAMES.contains(&&*cc) {
// We are building on Windows with the MSVC toolchain (and not cross-compiling for another architecture or target).
true
} else {
// We are cross-compiling to Windows with the MSVC toolchain.
let target_arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap_or_default();
let target_vendor = env::var("CARGO_CFG_TARGET_VENDOR").unwrap_or_default();
let cc = env::var(format!("CC_{target_arch}_{target_vendor}_windows_msvc"))
.unwrap_or_default()
.to_ascii_lowercase();
// Check if we are using the MSVC compiler.
MSVC_NAMES.contains(&&*cc)
}
}
fn is_x86_32() -> bool {
let arch = &target_components()[0];
arch == "i386" || arch == "i586" || arch == "i686"
}
fn is_armv7() -> bool {
target_components()[0] == "armv7"
}
fn is_aarch64() -> bool {
target_components()[0] == "aarch64"
}
// Windows targets may be using the MSVC toolchain or the GNU toolchain. The
// right compiler flags to use depend on the toolchain. (And we don't want to
// use flag_if_supported, because we don't want features to be silently
// disabled by old compilers.)
fn is_windows_msvc() -> bool {
// Some targets are only two components long, so check in steps.
target_components()[1] == "pc"
&& target_components()[2] == "windows"
&& target_components()[3] == "msvc"
}
fn new_build() -> cc::Build {
let mut build = cc::Build::new();
if !is_windows_msvc() {
build.flag("-std=c11");
}
build
}
fn new_cpp_build() -> cc::Build {
let mut build = cc::Build::new();
build.cpp(true);
if is_windows_msvc() {
build.flag("/std:c++20");
build.flag("/EHs-c-");
build.flag("/GR-");
} else {
build.flag("-std=c++20");
build.flag("-fno-exceptions");
build.flag("-fno-rtti");
}
build
}
fn c_dir_path(filename: &str) -> String {
// The `cross` tool doesn't support reading files in parent directories. As a hacky workaround
// in `cross_test.sh`, we move the c/ directory around and set BLAKE3_C_DIR_OVERRIDE. Regular
// building and testing doesn't require this.
if let Ok(c_dir_override) = env::var("BLAKE3_C_DIR_OVERRIDE") {
c_dir_override + "/" + filename
} else {
"../".to_string() + filename
}
}
fn main() -> Result<(), Box<dyn std::error::Error>> {
let mut base_build = new_build();
base_build.file(c_dir_path("blake3.c"));
base_build.file(c_dir_path("blake3_dispatch.c"));
base_build.file(c_dir_path("blake3_portable.c"));
if cfg!(feature = "tbb") {
base_build.define("BLAKE3_USE_TBB", "1");
}
base_build.compile("blake3_base");
if cfg!(feature = "tbb") {
let mut tbb_build = new_cpp_build();
tbb_build.define("BLAKE3_USE_TBB", "1");
tbb_build.file(c_dir_path("blake3_tbb.cpp"));
tbb_build.compile("blake3_tbb");
println!("cargo::rustc-link-lib=tbb");
}
if is_x86_64() && !defined("CARGO_FEATURE_PREFER_INTRINSICS") {
// On 64-bit, use the assembly implementations, unless the
// "prefer_intrinsics" feature is enabled.
if is_windows_target() {
if use_msvc_asm() {
let mut build = new_build();
build.file(c_dir_path("blake3_sse2_x86-64_windows_msvc.asm"));
build.file(c_dir_path("blake3_sse41_x86-64_windows_msvc.asm"));
build.file(c_dir_path("blake3_avx2_x86-64_windows_msvc.asm"));
build.file(c_dir_path("blake3_avx512_x86-64_windows_msvc.asm"));
build.compile("blake3_asm");
} else {
let mut build = new_build();
build.file(c_dir_path("blake3_sse2_x86-64_windows_gnu.S"));
build.file(c_dir_path("blake3_sse41_x86-64_windows_gnu.S"));
build.file(c_dir_path("blake3_avx2_x86-64_windows_gnu.S"));
build.file(c_dir_path("blake3_avx512_x86-64_windows_gnu.S"));
build.compile("blake3_asm");
}
} else {
// All non-Windows implementations are assumed to support
// Linux-style assembly. These files do contain a small
// explicit workaround for macOS also.
let mut build = new_build();
build.file(c_dir_path("blake3_sse2_x86-64_unix.S"));
build.file(c_dir_path("blake3_sse41_x86-64_unix.S"));
build.file(c_dir_path("blake3_avx2_x86-64_unix.S"));
build.file(c_dir_path("blake3_avx512_x86-64_unix.S"));
build.compile("blake3_asm");
}
} else if is_x86_64() || is_x86_32() {
// Assembly implementations are only for 64-bit. On 32-bit, or if
// the "prefer_intrinsics" feature is enabled, use the
// intrinsics-based C implementations. These each need to be
// compiled separately, with the corresponding instruction set
// extension explicitly enabled in the compiler.
let mut sse2_build = new_build();
sse2_build.file(c_dir_path("blake3_sse2.c"));
if is_windows_msvc() {
// /arch:SSE2 is the default on x86 and undefined on x86_64:
// https://docs.microsoft.com/en-us/cpp/build/reference/arch-x86
// It also includes SSE4.1 intrinsics:
// https://stackoverflow.com/a/32183222/823869
} else {
sse2_build.flag("-msse2");
}
sse2_build.compile("blake3_sse2");
let mut sse41_build = new_build();
sse41_build.file(c_dir_path("blake3_sse41.c"));
if is_windows_msvc() {
// /arch:SSE2 is the default on x86 and undefined on x86_64:
// https://docs.microsoft.com/en-us/cpp/build/reference/arch-x86
// It also includes SSE4.1 intrinsics:
// https://stackoverflow.com/a/32183222/823869
} else {
sse41_build.flag("-msse4.1");
}
sse41_build.compile("blake3_sse41");
let mut avx2_build = new_build();
avx2_build.file(c_dir_path("blake3_avx2.c"));
if is_windows_msvc() {
avx2_build.flag("/arch:AVX2");
} else {
avx2_build.flag("-mavx2");
}
avx2_build.compile("blake3_avx2");
let mut avx512_build = new_build();
avx512_build.file(c_dir_path("blake3_avx512.c"));
if is_windows_msvc() {
// Note that a lot of versions of MSVC don't support /arch:AVX512,
// and they'll discard it with a warning, hopefully leading to a
// build error.
avx512_build.flag("/arch:AVX512");
} else {
avx512_build.flag("-mavx512f");
avx512_build.flag("-mavx512vl");
}
avx512_build.compile("blake3_avx512");
}
// We build the NEON code here if
// 1) it's explicitly requested via the "neon" feature, which in practice
// only happens when this crate is built by hand, since the root crate
// normally does its own NEON build, or
// 2) the target is aarch64, where NEON is on by default.
if defined("CARGO_FEATURE_NEON") || is_aarch64() {
let mut neon_build = new_build();
neon_build.file(c_dir_path("blake3_neon.c"));
// ARMv7 platforms that support NEON generally need the following
// flags. AArch64 supports NEON by default and does not support -mfpu.
if is_armv7() {
neon_build.flag("-mfpu=neon-vfpv4");
neon_build.flag("-mfloat-abi=hard");
}
neon_build.compile("blake3_neon");
}
// The `cc` crate does not automatically emit rerun-if directives for the
// environment variables it supports, in particular for $CC. We expect to
// do a lot of benchmarking across different compilers, so we explicitly
// add the variables that we're likely to need.
println!("cargo:rerun-if-env-changed=CC");
println!("cargo:rerun-if-env-changed=CFLAGS");
// Ditto for source files, though these shouldn't change as often. `ignore::Walk` respects
// .gitignore, so this doesn't traverse target/.
for result in ignore::Walk::new("..") {
let result = result?;
let path = result.path();
if path.is_file() {
println!("cargo:rerun-if-changed={}", path.to_str().unwrap());
}
}
// When compiling with clang-cl for Windows, it adds .asm files to the root,
// which we need to delete so cargo doesn't get angry.
if is_windows_target() && !use_msvc_asm() {
let _ = std::fs::remove_file("blake3_avx2_x86-64_windows_gnu.asm");
let _ = std::fs::remove_file("blake3_avx512_x86-64_windows_gnu.asm");
let _ = std::fs::remove_file("blake3_sse2_x86-64_windows_gnu.asm");
let _ = std::fs::remove_file("blake3_sse41_x86-64_windows_gnu.asm");
}
Ok(())
}


@@ -0,0 +1,31 @@
#! /usr/bin/env bash
# This hacky script works around the fact that `cross test` does not support
# path dependencies. (It uses a docker shared folder to let the guest access
# project files, so parent directories aren't available.) We solve this by
# copying the entire project to a temp dir and rearranging paths to put "c" and
# "reference_impl" underneath "blake3_c_rust_bindings", so that everything is
# accessible. Hopefully this will just run on CI forever and no one will ever
# read this and discover my deep shame.
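# e.g. (illustrative): run this script with `--target aarch64-unknown-linux-gnu`;
# all arguments are forwarded to `cross test` below.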
set -e -u -o pipefail
project_root="$(realpath "$(dirname "$BASH_SOURCE")/../..")"
tmpdir="$(mktemp -d)"
echo "Running cross tests in $tmpdir"
cd "$tmpdir"
git clone "$project_root" blake3
mv blake3/c/blake3_c_rust_bindings .
mv blake3/reference_impl blake3_c_rust_bindings
mv blake3/c blake3_c_rust_bindings
cd blake3_c_rust_bindings
sed -i 's|reference_impl = { path = "../../reference_impl" }|reference_impl = { path = "reference_impl" }|' Cargo.toml
export BLAKE3_C_DIR_OVERRIDE="./c"
cat > Cross.toml << EOF
[build.env]
passthrough = [
"BLAKE3_C_DIR_OVERRIDE",
]
EOF
cross test "$@"


@@ -0,0 +1,333 @@
//! These are Rust bindings for the C implementation of BLAKE3. As there is a
//! native (and faster) Rust implementation of BLAKE3 provided in this same
//! repo, these bindings are not expected to be used in production. They're
//! intended for testing and benchmarking.
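//!
//! A minimal usage sketch (illustrative only; mirrors the API below):
//!
//! ```ignore
//! let mut hasher = Hasher::new();
//! hasher.update(b"some input");
//! let mut out = [0u8; 32];
//! hasher.finalize(&mut out);
//! ```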
use std::ffi::{c_void, CString};
use std::mem::MaybeUninit;
#[cfg(test)]
mod test;
pub const BLOCK_LEN: usize = 64;
pub const CHUNK_LEN: usize = 1024;
pub const OUT_LEN: usize = 32;
// Feature detection functions for tests and benchmarks. Note that the C code
// does its own feature detection in blake3_dispatch.c.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub fn sse2_detected() -> bool {
is_x86_feature_detected!("sse2")
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub fn sse41_detected() -> bool {
is_x86_feature_detected!("sse4.1")
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub fn avx2_detected() -> bool {
is_x86_feature_detected!("avx2")
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub fn avx512_detected() -> bool {
is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl")
}
#[derive(Clone)]
pub struct Hasher(ffi::blake3_hasher);
impl Hasher {
pub fn new() -> Self {
let mut c_state = MaybeUninit::uninit();
unsafe {
ffi::blake3_hasher_init(c_state.as_mut_ptr());
Self(c_state.assume_init())
}
}
pub fn new_keyed(key: &[u8; 32]) -> Self {
let mut c_state = MaybeUninit::uninit();
unsafe {
ffi::blake3_hasher_init_keyed(c_state.as_mut_ptr(), key.as_ptr());
Self(c_state.assume_init())
}
}
pub fn new_derive_key(context: &str) -> Self {
let mut c_state = MaybeUninit::uninit();
let context_c_string = CString::new(context).expect("valid C string, no null bytes");
unsafe {
ffi::blake3_hasher_init_derive_key(c_state.as_mut_ptr(), context_c_string.as_ptr());
Self(c_state.assume_init())
}
}
pub fn new_derive_key_raw(context: &[u8]) -> Self {
let mut c_state = MaybeUninit::uninit();
unsafe {
ffi::blake3_hasher_init_derive_key_raw(
c_state.as_mut_ptr(),
context.as_ptr() as *const _,
context.len(),
);
Self(c_state.assume_init())
}
}
pub fn update(&mut self, input: &[u8]) {
unsafe {
ffi::blake3_hasher_update(&mut self.0, input.as_ptr() as *const c_void, input.len());
}
}
#[cfg(feature = "tbb")]
pub fn update_tbb(&mut self, input: &[u8]) {
unsafe {
ffi::blake3_hasher_update_tbb(
&mut self.0,
input.as_ptr() as *const c_void,
input.len(),
);
}
}
pub fn finalize(&self, output: &mut [u8]) {
unsafe {
ffi::blake3_hasher_finalize(&self.0, output.as_mut_ptr(), output.len());
}
}
pub fn finalize_seek(&self, seek: u64, output: &mut [u8]) {
unsafe {
ffi::blake3_hasher_finalize_seek(&self.0, seek, output.as_mut_ptr(), output.len());
}
}
pub fn reset(&mut self) {
unsafe {
ffi::blake3_hasher_reset(&mut self.0);
}
}
}
pub mod ffi {
#[repr(C)]
#[derive(Copy, Clone)]
pub struct blake3_chunk_state {
pub cv: [u32; 8usize],
pub chunk_counter: u64,
pub buf: [u8; 64usize],
pub buf_len: u8,
pub blocks_compressed: u8,
pub flags: u8,
}
#[repr(C)]
#[derive(Copy, Clone)]
pub struct blake3_hasher {
pub key: [u32; 8usize],
pub chunk: blake3_chunk_state,
pub cv_stack_len: u8,
pub cv_stack: [u8; 1728usize],
}
extern "C" {
// public interface
pub fn blake3_hasher_init(self_: *mut blake3_hasher);
pub fn blake3_hasher_init_keyed(self_: *mut blake3_hasher, key: *const u8);
pub fn blake3_hasher_init_derive_key(
self_: *mut blake3_hasher,
context: *const ::std::os::raw::c_char,
);
pub fn blake3_hasher_init_derive_key_raw(
self_: *mut blake3_hasher,
context: *const ::std::os::raw::c_void,
context_len: usize,
);
pub fn blake3_hasher_update(
self_: *mut blake3_hasher,
input: *const ::std::os::raw::c_void,
input_len: usize,
);
#[cfg(feature = "tbb")]
pub fn blake3_hasher_update_tbb(
self_: *mut blake3_hasher,
input: *const ::std::os::raw::c_void,
input_len: usize,
);
pub fn blake3_hasher_finalize(self_: *const blake3_hasher, out: *mut u8, out_len: usize);
pub fn blake3_hasher_finalize_seek(
self_: *const blake3_hasher,
seek: u64,
out: *mut u8,
out_len: usize,
);
pub fn blake3_hasher_reset(self_: *mut blake3_hasher);
// portable low-level functions
pub fn blake3_compress_in_place_portable(
cv: *mut u32,
block: *const u8,
block_len: u8,
counter: u64,
flags: u8,
);
pub fn blake3_compress_xof_portable(
cv: *const u32,
block: *const u8,
block_len: u8,
counter: u64,
flags: u8,
out: *mut u8,
);
pub fn blake3_hash_many_portable(
inputs: *const *const u8,
num_inputs: usize,
blocks: usize,
key: *const u32,
counter: u64,
increment_counter: bool,
flags: u8,
flags_start: u8,
flags_end: u8,
out: *mut u8,
);
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub mod x86 {
extern "C" {
// SSE2 low level functions
pub fn blake3_compress_in_place_sse2(
cv: *mut u32,
block: *const u8,
block_len: u8,
counter: u64,
flags: u8,
);
pub fn blake3_compress_xof_sse2(
cv: *const u32,
block: *const u8,
block_len: u8,
counter: u64,
flags: u8,
out: *mut u8,
);
pub fn blake3_hash_many_sse2(
inputs: *const *const u8,
num_inputs: usize,
blocks: usize,
key: *const u32,
counter: u64,
increment_counter: bool,
flags: u8,
flags_start: u8,
flags_end: u8,
out: *mut u8,
);
// SSE4.1 low level functions
pub fn blake3_compress_in_place_sse41(
cv: *mut u32,
block: *const u8,
block_len: u8,
counter: u64,
flags: u8,
);
pub fn blake3_compress_xof_sse41(
cv: *const u32,
block: *const u8,
block_len: u8,
counter: u64,
flags: u8,
out: *mut u8,
);
pub fn blake3_hash_many_sse41(
inputs: *const *const u8,
num_inputs: usize,
blocks: usize,
key: *const u32,
counter: u64,
increment_counter: bool,
flags: u8,
flags_start: u8,
flags_end: u8,
out: *mut u8,
);
// AVX2 low level functions
pub fn blake3_hash_many_avx2(
inputs: *const *const u8,
num_inputs: usize,
blocks: usize,
key: *const u32,
counter: u64,
increment_counter: bool,
flags: u8,
flags_start: u8,
flags_end: u8,
out: *mut u8,
);
// AVX-512 low level functions
pub fn blake3_compress_xof_avx512(
cv: *const u32,
block: *const u8,
block_len: u8,
counter: u64,
flags: u8,
out: *mut u8,
);
pub fn blake3_compress_in_place_avx512(
cv: *mut u32,
block: *const u8,
block_len: u8,
counter: u64,
flags: u8,
);
pub fn blake3_hash_many_avx512(
inputs: *const *const u8,
num_inputs: usize,
blocks: usize,
key: *const u32,
counter: u64,
increment_counter: bool,
flags: u8,
flags_start: u8,
flags_end: u8,
out: *mut u8,
);
#[cfg(unix)]
pub fn blake3_xof_many_avx512(
cv: *const u32,
block: *const u8,
block_len: u8,
counter: u64,
flags: u8,
out: *mut u8,
outblocks: usize,
);
}
}
#[cfg(feature = "neon")]
pub mod neon {
extern "C" {
// NEON low level functions
pub fn blake3_hash_many_neon(
inputs: *const *const u8,
num_inputs: usize,
blocks: usize,
key: *const u32,
counter: u64,
increment_counter: bool,
flags: u8,
flags_start: u8,
flags_end: u8,
out: *mut u8,
);
}
}
}


@@ -0,0 +1,696 @@
// Most of this code is duplicated from the root `blake3` crate. Perhaps we
// could share more of it in the future.
use crate::{BLOCK_LEN, CHUNK_LEN, OUT_LEN};
use arrayref::{array_mut_ref, array_ref};
use arrayvec::ArrayVec;
use rand::prelude::*;
const CHUNK_START: u8 = 1 << 0;
const CHUNK_END: u8 = 1 << 1;
const PARENT: u8 = 1 << 2;
const ROOT: u8 = 1 << 3;
const KEYED_HASH: u8 = 1 << 4;
// const DERIVE_KEY_CONTEXT: u8 = 1 << 5;
// const DERIVE_KEY_MATERIAL: u8 = 1 << 6;
// Interesting input lengths to run tests on.
pub const TEST_CASES: &[usize] = &[
0,
1,
2,
3,
4,
5,
6,
7,
8,
BLOCK_LEN - 1,
BLOCK_LEN,
BLOCK_LEN + 1,
2 * BLOCK_LEN - 1,
2 * BLOCK_LEN,
2 * BLOCK_LEN + 1,
CHUNK_LEN - 1,
CHUNK_LEN,
CHUNK_LEN + 1,
2 * CHUNK_LEN,
2 * CHUNK_LEN + 1,
3 * CHUNK_LEN,
3 * CHUNK_LEN + 1,
4 * CHUNK_LEN,
4 * CHUNK_LEN + 1,
5 * CHUNK_LEN,
5 * CHUNK_LEN + 1,
6 * CHUNK_LEN,
6 * CHUNK_LEN + 1,
7 * CHUNK_LEN,
7 * CHUNK_LEN + 1,
8 * CHUNK_LEN,
8 * CHUNK_LEN + 1,
16 * CHUNK_LEN, // AVX512's bandwidth
31 * CHUNK_LEN, // 16 + 8 + 4 + 2 + 1
100 * CHUNK_LEN, // subtrees larger than MAX_SIMD_DEGREE chunks
];
pub const TEST_CASES_MAX: usize = 100 * CHUNK_LEN;
// There's a test to make sure these two are equal below.
pub const TEST_KEY: [u8; 32] = *b"whats the Elvish word for friend";
pub const TEST_KEY_WORDS: [u32; 8] = [
1952540791, 1752440947, 1816469605, 1752394102, 1919907616, 1868963940, 1919295602, 1684956521,
];
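// e.g. u32::from_le_bytes(*b"what") == 1952540791, the first word above.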
// Paint the input with a repeating byte pattern. We use a cycle length of 251,
// because that's the largest prime number less than 256. This makes it
// unlikely that swapping any two adjacent input blocks or chunks will give the
// same answer.
fn paint_test_input(buf: &mut [u8]) {
for (i, b) in buf.iter_mut().enumerate() {
*b = (i % 251) as u8;
}
}
#[inline(always)]
fn le_bytes_from_words_32(words: &[u32; 8]) -> [u8; 32] {
let mut out = [0; 32];
*array_mut_ref!(out, 0 * 4, 4) = words[0].to_le_bytes();
*array_mut_ref!(out, 1 * 4, 4) = words[1].to_le_bytes();
*array_mut_ref!(out, 2 * 4, 4) = words[2].to_le_bytes();
*array_mut_ref!(out, 3 * 4, 4) = words[3].to_le_bytes();
*array_mut_ref!(out, 4 * 4, 4) = words[4].to_le_bytes();
*array_mut_ref!(out, 5 * 4, 4) = words[5].to_le_bytes();
*array_mut_ref!(out, 6 * 4, 4) = words[6].to_le_bytes();
*array_mut_ref!(out, 7 * 4, 4) = words[7].to_le_bytes();
out
}
type CompressInPlaceFn =
unsafe extern "C" fn(cv: *mut u32, block: *const u8, block_len: u8, counter: u64, flags: u8);
type CompressXofFn = unsafe extern "C" fn(
cv: *const u32,
block: *const u8,
block_len: u8,
counter: u64,
flags: u8,
out: *mut u8,
);
// A shared helper function for platform-specific tests.
pub fn test_compress_fn(compress_in_place_fn: CompressInPlaceFn, compress_xof_fn: CompressXofFn) {
let initial_state = TEST_KEY_WORDS;
let block_len: u8 = 61;
let mut block = [0; BLOCK_LEN];
paint_test_input(&mut block[..block_len as usize]);
// Use a counter with set bits in both 32-bit words.
let counter = (5u64 << 32) + 6;
let flags = CHUNK_END | ROOT | KEYED_HASH;
let mut portable_out = [0; 64];
unsafe {
crate::ffi::blake3_compress_xof_portable(
initial_state.as_ptr(),
block.as_ptr(),
block_len,
counter,
flags,
portable_out.as_mut_ptr(),
);
}
let mut test_state = initial_state;
unsafe {
compress_in_place_fn(
test_state.as_mut_ptr(),
block.as_ptr(),
block_len,
counter,
flags,
)
};
let test_state_bytes = le_bytes_from_words_32(&test_state);
let mut test_xof = [0; 64];
unsafe {
compress_xof_fn(
initial_state.as_ptr(),
block.as_ptr(),
block_len,
counter,
flags,
test_xof.as_mut_ptr(),
)
};
assert_eq!(&portable_out[..32], &test_state_bytes[..]);
assert_eq!(&portable_out[..], &test_xof[..]);
}
// Testing the portable implementation against itself is circular, but why not.
#[test]
fn test_compress_portable() {
test_compress_fn(
crate::ffi::blake3_compress_in_place_portable,
crate::ffi::blake3_compress_xof_portable,
);
}
#[test]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn test_compress_sse2() {
if !crate::sse2_detected() {
return;
}
test_compress_fn(
crate::ffi::x86::blake3_compress_in_place_sse2,
crate::ffi::x86::blake3_compress_xof_sse2,
);
}
#[test]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn test_compress_sse41() {
if !crate::sse41_detected() {
return;
}
test_compress_fn(
crate::ffi::x86::blake3_compress_in_place_sse41,
crate::ffi::x86::blake3_compress_xof_sse41,
);
}
#[test]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn test_compress_avx512() {
if !crate::avx512_detected() {
return;
}
test_compress_fn(
crate::ffi::x86::blake3_compress_in_place_avx512,
crate::ffi::x86::blake3_compress_xof_avx512,
);
}
type HashManyFn = unsafe extern "C" fn(
inputs: *const *const u8,
num_inputs: usize,
blocks: usize,
key: *const u32,
counter: u64,
increment_counter: bool,
flags: u8,
flags_start: u8,
flags_end: u8,
out: *mut u8,
);
// A shared helper function for platform-specific tests.
pub fn test_hash_many_fn(hash_many_fn: HashManyFn) {
// Test a few different initial counter values.
// - 0: The base case.
// - u32::MAX: The low word of the counter overflows for all inputs except the first.
// - i32::MAX: *No* overflow. But carry bugs in tricky SIMD code can screw this up, if you XOR
// when you're supposed to ANDNOT...
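// e.g. 0xFFFF_FFFF + 1 == 0x1_0000_0000: the low word wraps to zero and the
// high word increments.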
let initial_counters = [0, u32::MAX as u64, i32::MAX as u64];
for counter in initial_counters {
dbg!(counter);
// 31 (16 + 8 + 4 + 2 + 1) inputs
const NUM_INPUTS: usize = 31;
let mut input_buf = [0; CHUNK_LEN * NUM_INPUTS];
crate::test::paint_test_input(&mut input_buf);
// First hash chunks.
let mut chunks = ArrayVec::<&[u8; CHUNK_LEN], NUM_INPUTS>::new();
for i in 0..NUM_INPUTS {
chunks.push(array_ref!(input_buf, i * CHUNK_LEN, CHUNK_LEN));
}
let mut portable_chunks_out = [0; NUM_INPUTS * OUT_LEN];
unsafe {
crate::ffi::blake3_hash_many_portable(
chunks.as_ptr() as _,
chunks.len(),
CHUNK_LEN / BLOCK_LEN,
TEST_KEY_WORDS.as_ptr(),
counter,
true,
KEYED_HASH,
CHUNK_START,
CHUNK_END,
portable_chunks_out.as_mut_ptr(),
);
}
let mut test_chunks_out = [0; NUM_INPUTS * OUT_LEN];
unsafe {
hash_many_fn(
chunks.as_ptr() as _,
chunks.len(),
CHUNK_LEN / BLOCK_LEN,
TEST_KEY_WORDS.as_ptr(),
counter,
true,
KEYED_HASH,
CHUNK_START,
CHUNK_END,
test_chunks_out.as_mut_ptr(),
);
}
for n in 0..NUM_INPUTS {
dbg!(n);
assert_eq!(
&portable_chunks_out[n * OUT_LEN..][..OUT_LEN],
&test_chunks_out[n * OUT_LEN..][..OUT_LEN]
);
}
// Then hash parents.
let mut parents = ArrayVec::<&[u8; 2 * OUT_LEN], NUM_INPUTS>::new();
for i in 0..NUM_INPUTS {
parents.push(array_ref!(input_buf, i * 2 * OUT_LEN, 2 * OUT_LEN));
}
let mut portable_parents_out = [0; NUM_INPUTS * OUT_LEN];
unsafe {
crate::ffi::blake3_hash_many_portable(
parents.as_ptr() as _,
parents.len(),
1,
TEST_KEY_WORDS.as_ptr(),
counter,
false,
KEYED_HASH | PARENT,
0,
0,
portable_parents_out.as_mut_ptr(),
);
}
let mut test_parents_out = [0; NUM_INPUTS * OUT_LEN];
unsafe {
hash_many_fn(
parents.as_ptr() as _,
parents.len(),
1,
TEST_KEY_WORDS.as_ptr(),
counter,
false,
KEYED_HASH | PARENT,
0,
0,
test_parents_out.as_mut_ptr(),
);
}
for n in 0..NUM_INPUTS {
dbg!(n);
assert_eq!(
&portable_parents_out[n * OUT_LEN..][..OUT_LEN],
&test_parents_out[n * OUT_LEN..][..OUT_LEN]
);
}
}
}
// Testing the portable implementation against itself is circular, but why not.
#[test]
fn test_hash_many_portable() {
test_hash_many_fn(crate::ffi::blake3_hash_many_portable);
}
#[test]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn test_hash_many_sse2() {
if !crate::sse2_detected() {
return;
}
test_hash_many_fn(crate::ffi::x86::blake3_hash_many_sse2);
}
#[test]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn test_hash_many_sse41() {
if !crate::sse41_detected() {
return;
}
test_hash_many_fn(crate::ffi::x86::blake3_hash_many_sse41);
}
#[test]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn test_hash_many_avx2() {
if !crate::avx2_detected() {
return;
}
test_hash_many_fn(crate::ffi::x86::blake3_hash_many_avx2);
}
#[test]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn test_hash_many_avx512() {
if !crate::avx512_detected() {
return;
}
test_hash_many_fn(crate::ffi::x86::blake3_hash_many_avx512);
}
#[test]
#[cfg(feature = "neon")]
fn test_hash_many_neon() {
test_hash_many_fn(crate::ffi::neon::blake3_hash_many_neon);
}
#[allow(unused)]
type XofManyFunction = unsafe extern "C" fn(
cv: *const u32,
block: *const u8,
block_len: u8,
counter: u64,
flags: u8,
out: *mut u8,
outblocks: usize,
);
// A shared helper function for platform-specific tests.
#[allow(unused)]
pub fn test_xof_many_fn(xof_many_function: XofManyFunction) {
let mut block = [0; BLOCK_LEN];
let block_len = 42;
crate::test::paint_test_input(&mut block[..block_len]);
let cv = [40, 41, 42, 43, 44, 45, 46, 47];
let flags = KEYED_HASH;
// Test a few different initial counter values.
// - 0: The base case.
// - u32::MAX: The low word of the counter overflows for all inputs except the first.
// - i32::MAX: *No* overflow. But carry bugs in tricky SIMD code can screw this up, if you XOR
// when you're supposed to ANDNOT...
let initial_counters = [0, u32::MAX as u64, i32::MAX as u64];
for counter in initial_counters {
dbg!(counter);
// 31 (16 + 8 + 4 + 2 + 1) outputs
const OUTPUT_SIZE: usize = 31 * BLOCK_LEN;
let mut portable_out = [0u8; OUTPUT_SIZE];
for (i, out_block) in portable_out.chunks_exact_mut(BLOCK_LEN).enumerate() {
unsafe {
crate::ffi::blake3_compress_xof_portable(
cv.as_ptr(),
block.as_ptr(),
block_len as u8,
counter + i as u64,
flags,
out_block.as_mut_ptr(),
);
}
}
let mut test_out = [0u8; OUTPUT_SIZE];
unsafe {
xof_many_function(
cv.as_ptr(),
block.as_ptr(),
block_len as u8,
counter,
flags,
test_out.as_mut_ptr(),
OUTPUT_SIZE / BLOCK_LEN,
);
}
assert_eq!(portable_out, test_out);
}
// Test that xof_many doesn't write more blocks than requested. Note that the current assembly
// implementation always outputs at least one block, so we don't test the zero case.
for block_count in 1..=32 {
let mut array = [0; BLOCK_LEN * 33];
let output_start = 17;
let output_len = block_count * BLOCK_LEN;
let output_end = output_start + output_len;
let output = &mut array[output_start..output_end];
unsafe {
xof_many_function(
cv.as_ptr(),
block.as_ptr(),
block_len as u8,
0,
flags,
output.as_mut_ptr(),
block_count,
);
}
for i in 0..array.len() {
if i < output_start || output_end <= i {
assert_eq!(0, array[i], "index {i}");
}
}
}
}
#[test]
#[cfg(unix)]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn test_xof_many_avx512() {
if !crate::avx512_detected() {
return;
}
test_xof_many_fn(crate::ffi::x86::blake3_xof_many_avx512);
}
#[test]
fn test_compare_reference_impl() {
const OUT: usize = 303; // more than 64, not a multiple of 4
let mut input_buf = [0; TEST_CASES_MAX];
paint_test_input(&mut input_buf);
for &case in TEST_CASES {
let input = &input_buf[..case];
dbg!(case);
// regular
{
let mut reference_hasher = reference_impl::Hasher::new();
reference_hasher.update(input);
let mut expected_out = [0; OUT];
reference_hasher.finalize(&mut expected_out);
let mut test_hasher = crate::Hasher::new();
test_hasher.update(input);
let mut test_out = [0; OUT];
test_hasher.finalize(&mut test_out);
assert_eq!(test_out[..], expected_out[..]);
#[cfg(feature = "tbb")]
{
let mut tbb_hasher = crate::Hasher::new();
tbb_hasher.update_tbb(input);
let mut tbb_out = [0; OUT];
tbb_hasher.finalize(&mut tbb_out);
assert_eq!(tbb_out[..], expected_out[..]);
}
}
// keyed
{
let mut reference_hasher = reference_impl::Hasher::new_keyed(&TEST_KEY);
reference_hasher.update(input);
let mut expected_out = [0; OUT];
reference_hasher.finalize(&mut expected_out);
let mut test_hasher = crate::Hasher::new_keyed(&TEST_KEY);
test_hasher.update(input);
let mut test_out = [0; OUT];
test_hasher.finalize(&mut test_out);
assert_eq!(test_out[..], expected_out[..]);
#[cfg(feature = "tbb")]
{
let mut tbb_hasher = crate::Hasher::new_keyed(&TEST_KEY);
tbb_hasher.update_tbb(input);
let mut tbb_out = [0; OUT];
tbb_hasher.finalize(&mut tbb_out);
assert_eq!(tbb_out[..], expected_out[..]);
}
}
// derive_key
{
let context = "BLAKE3 2019-12-27 16:13:59 example context (not the test vector one)";
let mut reference_hasher = reference_impl::Hasher::new_derive_key(context);
reference_hasher.update(input);
let mut expected_out = [0; OUT];
reference_hasher.finalize(&mut expected_out);
// the regular C string API
let mut test_hasher = crate::Hasher::new_derive_key(context);
test_hasher.update(input);
let mut test_out = [0; OUT];
test_hasher.finalize(&mut test_out);
assert_eq!(test_out[..], expected_out[..]);
// the raw bytes API
let mut test_hasher_raw = crate::Hasher::new_derive_key_raw(context.as_bytes());
test_hasher_raw.update(input);
let mut test_out_raw = [0; OUT];
test_hasher_raw.finalize(&mut test_out_raw);
assert_eq!(test_out_raw[..], expected_out[..]);
#[cfg(feature = "tbb")]
{
let mut tbb_hasher = crate::Hasher::new_derive_key(context);
tbb_hasher.update_tbb(input);
let mut tbb_out = [0; OUT];
tbb_hasher.finalize(&mut tbb_out);
assert_eq!(tbb_out[..], expected_out[..]);
}
}
}
}
fn reference_hash(input: &[u8]) -> [u8; OUT_LEN] {
let mut hasher = reference_impl::Hasher::new();
hasher.update(input);
let mut bytes = [0; OUT_LEN];
hasher.finalize(&mut bytes);
bytes.into()
}
#[test]
fn test_compare_update_multiple() {
// Don't use all the long test cases here, since that's unnecessarily slow
// in debug mode.
let mut short_test_cases = TEST_CASES;
while *short_test_cases.last().unwrap() > 4 * CHUNK_LEN {
short_test_cases = &short_test_cases[..short_test_cases.len() - 1];
}
assert_eq!(*short_test_cases.last().unwrap(), 4 * CHUNK_LEN);
let mut input_buf = [0; 2 * TEST_CASES_MAX];
paint_test_input(&mut input_buf);
for &first_update in short_test_cases {
dbg!(first_update);
let first_input = &input_buf[..first_update];
let mut test_hasher = crate::Hasher::new();
test_hasher.update(first_input);
for &second_update in short_test_cases {
dbg!(second_update);
let second_input = &input_buf[first_update..][..second_update];
let total_input = &input_buf[..first_update + second_update];
// Clone the hasher with first_update bytes already written, so
// that the next iteration can reuse it.
let mut test_hasher = test_hasher.clone();
test_hasher.update(second_input);
let mut test_out = [0; OUT_LEN];
test_hasher.finalize(&mut test_out);
let expected = reference_hash(total_input);
assert_eq!(expected, test_out);
}
}
}
#[test]
fn test_fuzz_hasher() {
const INPUT_MAX: usize = 4 * CHUNK_LEN;
let mut input_buf = [0; 3 * INPUT_MAX];
paint_test_input(&mut input_buf);
// Don't do too many iterations in debug mode, to keep the tests under a
// second or so. CI should run tests in release mode also. Provide an
// environment variable for specifying a larger number of fuzz iterations.
let num_tests = if cfg!(debug_assertions) { 100 } else { 10_000 };
// Use a fixed RNG seed for reproducibility.
let mut rng = rand_chacha::ChaCha8Rng::from_seed([1; 32]);
for _num_test in 0..num_tests {
dbg!(_num_test);
let mut hasher = crate::Hasher::new();
let mut total_input = 0;
// For each test, write 3 inputs of random length.
for _ in 0..3 {
let input_len = rng.random_range(0..INPUT_MAX + 1);
dbg!(input_len);
let input = &input_buf[total_input..][..input_len];
hasher.update(input);
total_input += input_len;
}
let expected = reference_hash(&input_buf[..total_input]);
let mut test_out = [0; 32];
hasher.finalize(&mut test_out);
assert_eq!(expected, test_out);
}
}
#[test]
fn test_finalize_seek() {
let mut expected = [0; 1000];
{
let mut reference_hasher = reference_impl::Hasher::new();
reference_hasher.update(b"foobarbaz");
reference_hasher.finalize(&mut expected);
}
let mut test_hasher = crate::Hasher::new();
test_hasher.update(b"foobarbaz");
let mut out = [0; 103];
for &seek in &[0, 1, 7, 59, 63, 64, 65, 501, expected.len() - out.len()] {
dbg!(seek);
test_hasher.finalize_seek(seek as u64, &mut out);
assert_eq!(&expected[seek..][..out.len()], &out[..]);
}
}
#[test]
fn test_reset() {
{
let mut hasher = crate::Hasher::new();
hasher.update(&[42; 3 * CHUNK_LEN + 7]);
hasher.reset();
hasher.update(&[42; CHUNK_LEN + 3]);
let mut output = [0; 32];
hasher.finalize(&mut output);
let mut reference_hasher = reference_impl::Hasher::new();
reference_hasher.update(&[42; CHUNK_LEN + 3]);
let mut reference_hash = [0; 32];
reference_hasher.finalize(&mut reference_hash);
assert_eq!(reference_hash, output);
}
{
let key = &[99; 32];
let mut hasher = crate::Hasher::new_keyed(key);
hasher.update(&[42; 3 * CHUNK_LEN + 7]);
hasher.reset();
hasher.update(&[42; CHUNK_LEN + 3]);
let mut output = [0; 32];
hasher.finalize(&mut output);
let mut reference_hasher = reference_impl::Hasher::new_keyed(key);
reference_hasher.update(&[42; CHUNK_LEN + 3]);
let mut reference_hash = [0; 32];
reference_hasher.finalize(&mut reference_hash);
assert_eq!(reference_hash, output);
}
{
let context = "BLAKE3 2020-02-12 10:20:58 reset test";
let mut hasher = crate::Hasher::new_derive_key(context);
hasher.update(&[42; 3 * CHUNK_LEN + 7]);
hasher.reset();
hasher.update(&[42; CHUNK_LEN + 3]);
let mut output = [0; 32];
hasher.finalize(&mut output);
let mut reference_hasher = reference_impl::Hasher::new_derive_key(context);
reference_hasher.update(&[42; CHUNK_LEN + 3]);
let mut reference_hash = [0; 32];
reference_hasher.finalize(&mut reference_hash);
assert_eq!(reference_hash, output);
}
}

external/blake3/blake3_dispatch.c vendored Normal file

@@ -0,0 +1,332 @@
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include "blake3_impl.h"
#if defined(_MSC_VER)
#include <Windows.h>
#endif
#if defined(IS_X86)
#if defined(_MSC_VER)
#include <intrin.h>
#elif defined(__GNUC__)
#include <immintrin.h>
#else
#undef IS_X86 /* Unimplemented! */
#endif
#endif
#if !defined(BLAKE3_ATOMICS)
#if defined(__has_include)
#if __has_include(<stdatomic.h>) && !defined(_MSC_VER)
#define BLAKE3_ATOMICS 1
#else
#define BLAKE3_ATOMICS 0
#endif /* __has_include(<stdatomic.h>) && !defined(_MSC_VER) */
#else
#define BLAKE3_ATOMICS 0
#endif /* defined(__has_include) */
#endif /* BLAKE3_ATOMICS */
#if BLAKE3_ATOMICS
#define ATOMIC_INT _Atomic int
#define ATOMIC_LOAD(x) x
#define ATOMIC_STORE(x, y) x = y
#elif defined(_MSC_VER)
#define ATOMIC_INT LONG
#define ATOMIC_LOAD(x) InterlockedOr(&x, 0)
#define ATOMIC_STORE(x, y) InterlockedExchange(&x, y)
#else
#define ATOMIC_INT int
#define ATOMIC_LOAD(x) x
#define ATOMIC_STORE(x, y) x = y
#endif
#define MAYBE_UNUSED(x) (void)((x))
#if defined(IS_X86)
static uint64_t xgetbv(void) {
#if defined(_MSC_VER)
return _xgetbv(0);
#else
uint32_t eax = 0, edx = 0;
__asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0));
return ((uint64_t)edx << 32) | eax;
#endif
}
static void cpuid(uint32_t out[4], uint32_t id) {
#if defined(_MSC_VER)
__cpuid((int *)out, id);
#elif defined(__i386__) || defined(_M_IX86)
__asm__ __volatile__("movl %%ebx, %1\n"
"cpuid\n"
"xchgl %1, %%ebx\n"
: "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
: "a"(id));
#else
__asm__ __volatile__("cpuid\n"
: "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
: "a"(id));
#endif
}
static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) {
#if defined(_MSC_VER)
__cpuidex((int *)out, id, sid);
#elif defined(__i386__) || defined(_M_IX86)
__asm__ __volatile__("movl %%ebx, %1\n"
"cpuid\n"
"xchgl %1, %%ebx\n"
: "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
: "a"(id), "c"(sid));
#else
__asm__ __volatile__("cpuid\n"
: "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
: "a"(id), "c"(sid));
#endif
}
#endif
enum cpu_feature {
SSE2 = 1 << 0,
SSSE3 = 1 << 1,
SSE41 = 1 << 2,
AVX = 1 << 3,
AVX2 = 1 << 4,
AVX512F = 1 << 5,
AVX512VL = 1 << 6,
/* ... */
UNDEFINED = 1 << 30
};
#if !defined(BLAKE3_TESTING)
static /* Allow the variable to be controlled manually for testing */
#endif
ATOMIC_INT g_cpu_features = UNDEFINED;
#if !defined(BLAKE3_TESTING)
static
#endif
enum cpu_feature
get_cpu_features(void) {
/* If TSAN detects a data race here, try compiling with -DBLAKE3_ATOMICS=1 */
enum cpu_feature features = ATOMIC_LOAD(g_cpu_features);
if (features != UNDEFINED) {
return features;
} else {
#if defined(IS_X86)
uint32_t regs[4] = {0};
uint32_t *eax = &regs[0], *ebx = &regs[1], *ecx = &regs[2], *edx = &regs[3];
(void)edx;
features = 0;
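// CPUID leaf 0 reports the highest supported standard leaf in EAX.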
cpuid(regs, 0);
const int max_id = *eax;
cpuid(regs, 1);
#if defined(__amd64__) || defined(_M_X64)
features |= SSE2;
#else
if (*edx & (1UL << 26))
features |= SSE2;
#endif
if (*ecx & (1UL << 9))
features |= SSSE3;
if (*ecx & (1UL << 19))
features |= SSE41;
if (*ecx & (1UL << 27)) { // OSXSAVE
const uint64_t mask = xgetbv();
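// XCR0 bit 1 (SSE state) and bit 2 (AVX state) must both be enabled: 6 == 0b110.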
if ((mask & 6) == 6) { // SSE and AVX states
if (*ecx & (1UL << 28))
features |= AVX;
if (max_id >= 7) {
cpuidex(regs, 7, 0);
if (*ebx & (1UL << 5))
features |= AVX2;
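// XCR0 bits 5-7 (opmask, ZMM_Hi256, Hi16_ZMM state): 224 == 0b11100000.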
if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm
if (*ebx & (1UL << 31))
features |= AVX512VL;
if (*ebx & (1UL << 16))
features |= AVX512F;
}
}
}
}
ATOMIC_STORE(g_cpu_features, features);
return features;
#else
/* How to detect NEON? */
return 0;
#endif
}
}
void blake3_compress_in_place(uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags) {
#if defined(IS_X86)
const enum cpu_feature features = get_cpu_features();
MAYBE_UNUSED(features);
#if !defined(BLAKE3_NO_AVX512)
if (features & AVX512VL) {
blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
return;
}
#endif
#if !defined(BLAKE3_NO_SSE41)
if (features & SSE41) {
blake3_compress_in_place_sse41(cv, block, block_len, counter, flags);
return;
}
#endif
#if !defined(BLAKE3_NO_SSE2)
if (features & SSE2) {
blake3_compress_in_place_sse2(cv, block, block_len, counter, flags);
return;
}
#endif
#endif
blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
}
void blake3_compress_xof(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter, uint8_t flags,
uint8_t out[64]) {
#if defined(IS_X86)
const enum cpu_feature features = get_cpu_features();
MAYBE_UNUSED(features);
#if !defined(BLAKE3_NO_AVX512)
if (features & AVX512VL) {
blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
return;
}
#endif
#if !defined(BLAKE3_NO_SSE41)
if (features & SSE41) {
blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out);
return;
}
#endif
#if !defined(BLAKE3_NO_SSE2)
if (features & SSE2) {
blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out);
return;
}
#endif
#endif
blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
}
void blake3_xof_many(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter, uint8_t flags,
uint8_t out[64], size_t outblocks) {
if (outblocks == 0) {
// The current assembly implementation always outputs at least 1 block.
return;
}
#if defined(IS_X86)
const enum cpu_feature features = get_cpu_features();
MAYBE_UNUSED(features);
#if !defined(_WIN32) && !defined(BLAKE3_NO_AVX512)
if (features & AVX512VL) {
blake3_xof_many_avx512(cv, block, block_len, counter, flags, out, outblocks);
return;
}
#endif
#endif
for(size_t i = 0; i < outblocks; ++i) {
blake3_compress_xof(cv, block, block_len, counter + i, flags, out + 64*i);
}
}
void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8], uint64_t counter,
bool increment_counter, uint8_t flags,
uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
#if defined(IS_X86)
const enum cpu_feature features = get_cpu_features();
MAYBE_UNUSED(features);
#if !defined(BLAKE3_NO_AVX512)
if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
increment_counter, flags, flags_start, flags_end,
out);
return;
}
#endif
#if !defined(BLAKE3_NO_AVX2)
if (features & AVX2) {
blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter,
increment_counter, flags, flags_start, flags_end,
out);
return;
}
#endif
#if !defined(BLAKE3_NO_SSE41)
if (features & SSE41) {
blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
increment_counter, flags, flags_start, flags_end,
out);
return;
}
#endif
#if !defined(BLAKE3_NO_SSE2)
if (features & SSE2) {
blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
increment_counter, flags, flags_start, flags_end,
out);
return;
}
#endif
#endif
#if BLAKE3_USE_NEON == 1
blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
increment_counter, flags, flags_start, flags_end, out);
return;
#endif
blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
increment_counter, flags, flags_start, flags_end,
out);
}
// The dynamically detected SIMD degree of the current platform.
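// That is, how many inputs the best available hash_many kernel processes in
// parallel: 16 with AVX-512, 8 with AVX2, 4 with SSE4.1/SSE2 or NEON, else 1.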
size_t blake3_simd_degree(void) {
#if defined(IS_X86)
const enum cpu_feature features = get_cpu_features();
MAYBE_UNUSED(features);
#if !defined(BLAKE3_NO_AVX512)
if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
return 16;
}
#endif
#if !defined(BLAKE3_NO_AVX2)
if (features & AVX2) {
return 8;
}
#endif
#if !defined(BLAKE3_NO_SSE41)
if (features & SSE41) {
return 4;
}
#endif
#if !defined(BLAKE3_NO_SSE2)
if (features & SSE2) {
return 4;
}
#endif
#endif
#if BLAKE3_USE_NEON == 1
return 4;
#endif
return 1;
}

external/blake3/blake3_impl.h vendored Normal file

@@ -0,0 +1,333 @@
#ifndef BLAKE3_IMPL_H
#define BLAKE3_IMPL_H
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include "blake3.h"
#ifdef __cplusplus
extern "C" {
#endif
// internal flags
enum blake3_flags {
CHUNK_START = 1 << 0,
CHUNK_END = 1 << 1,
PARENT = 1 << 2,
ROOT = 1 << 3,
KEYED_HASH = 1 << 4,
DERIVE_KEY_CONTEXT = 1 << 5,
DERIVE_KEY_MATERIAL = 1 << 6,
};
// This C implementation tries to support recent versions of GCC, Clang, and
// MSVC.
#if defined(_MSC_VER)
#define INLINE static __forceinline
#else
#define INLINE static inline __attribute__((always_inline))
#endif
#ifdef __cplusplus
#define NOEXCEPT noexcept
#else
#define NOEXCEPT
#endif
#if (defined(__x86_64__) || defined(_M_X64)) && !defined(_M_ARM64EC)
#define IS_X86
#define IS_X86_64
#endif
#if defined(__i386__) || defined(_M_IX86)
#define IS_X86
#define IS_X86_32
#endif
#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
#define IS_AARCH64
#endif
#if defined(IS_X86)
#if defined(_MSC_VER)
#include <intrin.h>
#endif
#endif
#if !defined(BLAKE3_USE_NEON)
// If BLAKE3_USE_NEON not manually set, autodetect based on AArch64ness
#if defined(IS_AARCH64)
#if defined(__ARM_BIG_ENDIAN)
#define BLAKE3_USE_NEON 0
#else
#define BLAKE3_USE_NEON 1
#endif
#else
#define BLAKE3_USE_NEON 0
#endif
#endif
#if defined(IS_X86)
#define MAX_SIMD_DEGREE 16
#elif BLAKE3_USE_NEON == 1
#define MAX_SIMD_DEGREE 4
#else
#define MAX_SIMD_DEGREE 1
#endif
// There are some places where we want a static size that's equal to the
// MAX_SIMD_DEGREE, but also at least 2.
#define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? MAX_SIMD_DEGREE : 2)
static const uint32_t IV[8] = {0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL,
0xA54FF53AUL, 0x510E527FUL, 0x9B05688CUL,
0x1F83D9ABUL, 0x5BE0CD19UL};
static const uint8_t MSG_SCHEDULE[7][16] = {
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
{2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8},
{3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1},
{10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6},
{12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4},
{9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7},
{11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13},
};
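// Each row applies the fixed BLAKE3 permutation to the row above:
// MSG_SCHEDULE[r + 1][i] == MSG_SCHEDULE[r][P[i]], where
// P = {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8} (row 1 itself).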
/* Find index of the highest set bit */
/* x is assumed to be nonzero. */
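/* e.g. highest_one(1) == 0 and highest_one(0x10) == 4. */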
static unsigned int highest_one(uint64_t x) {
#if defined(__GNUC__) || defined(__clang__)
return 63 ^ (unsigned int)__builtin_clzll(x);
#elif defined(_MSC_VER) && defined(IS_X86_64)
unsigned long index;
_BitScanReverse64(&index, x);
return index;
#elif defined(_MSC_VER) && defined(IS_X86_32)
if(x >> 32) {
unsigned long index;
_BitScanReverse(&index, (unsigned long)(x >> 32));
return 32 + index;
} else {
unsigned long index;
_BitScanReverse(&index, (unsigned long)x);
return index;
}
#else
unsigned int c = 0;
if(x & 0xffffffff00000000ULL) { x >>= 32; c += 32; }
if(x & 0x00000000ffff0000ULL) { x >>= 16; c += 16; }
if(x & 0x000000000000ff00ULL) { x >>= 8; c += 8; }
if(x & 0x00000000000000f0ULL) { x >>= 4; c += 4; }
if(x & 0x000000000000000cULL) { x >>= 2; c += 2; }
if(x & 0x0000000000000002ULL) { c += 1; }
return c;
#endif
}
// Count the number of 1 bits.
INLINE unsigned int popcnt(uint64_t x) {
#if defined(__GNUC__) || defined(__clang__)
return (unsigned int)__builtin_popcountll(x);
#else
unsigned int count = 0;
while (x != 0) {
count += 1;
x &= x - 1;
}
return count;
#endif
}
// Largest power of two less than or equal to x. As a special case, returns 1
// when x is 0.
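// e.g. round_down_to_power_of_2(10) == 8 and round_down_to_power_of_2(16) == 16.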
INLINE uint64_t round_down_to_power_of_2(uint64_t x) {
return 1ULL << highest_one(x | 1);
}
INLINE uint32_t counter_low(uint64_t counter) { return (uint32_t)counter; }
INLINE uint32_t counter_high(uint64_t counter) {
return (uint32_t)(counter >> 32);
}
INLINE uint32_t load32(const void *src) {
const uint8_t *p = (const uint8_t *)src;
return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) |
((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24);
}
INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN],
uint32_t key_words[8]) {
key_words[0] = load32(&key[0 * 4]);
key_words[1] = load32(&key[1 * 4]);
key_words[2] = load32(&key[2 * 4]);
key_words[3] = load32(&key[3 * 4]);
key_words[4] = load32(&key[4 * 4]);
key_words[5] = load32(&key[5 * 4]);
key_words[6] = load32(&key[6 * 4]);
key_words[7] = load32(&key[7 * 4]);
}
INLINE void load_block_words(const uint8_t block[BLAKE3_BLOCK_LEN],
uint32_t block_words[16]) {
for (size_t i = 0; i < 16; i++) {
block_words[i] = load32(&block[i * 4]);
}
}
INLINE void store32(void *dst, uint32_t w) {
uint8_t *p = (uint8_t *)dst;
p[0] = (uint8_t)(w >> 0);
p[1] = (uint8_t)(w >> 8);
p[2] = (uint8_t)(w >> 16);
p[3] = (uint8_t)(w >> 24);
}
INLINE void store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8]) {
store32(&bytes_out[0 * 4], cv_words[0]);
store32(&bytes_out[1 * 4], cv_words[1]);
store32(&bytes_out[2 * 4], cv_words[2]);
store32(&bytes_out[3 * 4], cv_words[3]);
store32(&bytes_out[4 * 4], cv_words[4]);
store32(&bytes_out[5 * 4], cv_words[5]);
store32(&bytes_out[6 * 4], cv_words[6]);
store32(&bytes_out[7 * 4], cv_words[7]);
}
void blake3_compress_in_place(uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags);
void blake3_compress_xof(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter, uint8_t flags,
uint8_t out[64]);
void blake3_xof_many(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter, uint8_t flags,
uint8_t out[64], size_t outblocks);
void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8], uint64_t counter,
bool increment_counter, uint8_t flags,
uint8_t flags_start, uint8_t flags_end, uint8_t *out);
size_t blake3_simd_degree(void);
BLAKE3_PRIVATE size_t blake3_compress_subtree_wide(const uint8_t *input, size_t input_len,
const uint32_t key[8],
uint64_t chunk_counter, uint8_t flags,
uint8_t *out, bool use_tbb);
#if defined(BLAKE3_USE_TBB)
BLAKE3_PRIVATE void blake3_compress_subtree_wide_join_tbb(
// shared params
const uint32_t key[8], uint8_t flags, bool use_tbb,
// left-hand side params
const uint8_t *l_input, size_t l_input_len, uint64_t l_chunk_counter,
uint8_t *l_cvs, size_t *l_n,
// right-hand side params
const uint8_t *r_input, size_t r_input_len, uint64_t r_chunk_counter,
uint8_t *r_cvs, size_t *r_n) NOEXCEPT;
#endif
// Declarations for implementation-specific functions.
void blake3_compress_in_place_portable(uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags);
void blake3_compress_xof_portable(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags, uint8_t out[64]);
void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
uint64_t counter, bool increment_counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out);
#if defined(IS_X86)
#if !defined(BLAKE3_NO_SSE2)
void blake3_compress_in_place_sse2(uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags);
void blake3_compress_xof_sse2(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags, uint8_t out[64]);
void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
uint64_t counter, bool increment_counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out);
#endif
#if !defined(BLAKE3_NO_SSE41)
void blake3_compress_in_place_sse41(uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags);
void blake3_compress_xof_sse41(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags, uint8_t out[64]);
void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
uint64_t counter, bool increment_counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out);
#endif
#if !defined(BLAKE3_NO_AVX2)
void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
uint64_t counter, bool increment_counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out);
#endif
#if !defined(BLAKE3_NO_AVX512)
void blake3_compress_in_place_avx512(uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags);
void blake3_compress_xof_avx512(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags, uint8_t out[64]);
void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
uint64_t counter, bool increment_counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out);
#if !defined(_WIN32)
void blake3_xof_many_avx512(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter, uint8_t flags,
uint8_t* out, size_t outblocks);
#endif
#endif
#endif
#if BLAKE3_USE_NEON == 1
void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
uint64_t counter, bool increment_counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out);
#endif
#ifdef __cplusplus
}
#endif
#endif /* BLAKE3_IMPL_H */

external/blake3/blake3_neon.c vendored Normal file

@@ -0,0 +1,366 @@
#include "blake3_impl.h"
#include <arm_neon.h>
#ifdef __ARM_BIG_ENDIAN
#error "This implementation only supports little-endian ARM."
// It might be that all we need for big-endian support here is to get the loads
// and stores right, but step zero would be finding a way to test it in CI.
#endif
INLINE uint32x4_t loadu_128(const uint8_t src[16]) {
// vld1q_u32 has alignment requirements. Don't use it.
return vreinterpretq_u32_u8(vld1q_u8(src));
}
INLINE void storeu_128(uint32x4_t src, uint8_t dest[16]) {
// vst1q_u32 has alignment requirements. Don't use it.
vst1q_u8(dest, vreinterpretq_u8_u32(src));
}
INLINE uint32x4_t add_128(uint32x4_t a, uint32x4_t b) {
return vaddq_u32(a, b);
}
INLINE uint32x4_t xor_128(uint32x4_t a, uint32x4_t b) {
return veorq_u32(a, b);
}
INLINE uint32x4_t set1_128(uint32_t x) { return vld1q_dup_u32(&x); }
INLINE uint32x4_t set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
uint32_t array[4] = {a, b, c, d};
return vld1q_u32(array);
}
INLINE uint32x4_t rot16_128(uint32x4_t x) {
// The straightforward implementation would be two shifts and an or, but that's
// slower on microarchitectures we've tested. See
// https://github.com/BLAKE3-team/BLAKE3/pull/319.
// return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16));
return vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(x)));
}
INLINE uint32x4_t rot12_128(uint32x4_t x) {
// See comment in rot16_128.
// return vorrq_u32(vshrq_n_u32(x, 12), vshlq_n_u32(x, 32 - 12));
return vsriq_n_u32(vshlq_n_u32(x, 32-12), x, 12);
}
INLINE uint32x4_t rot8_128(uint32x4_t x) {
// See comment in rot16_128.
// return vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 32 - 8));
#if defined(__clang__)
return vreinterpretq_u32_u8(__builtin_shufflevector(vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x), 1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12));
#elif __GNUC__ * 10000 + __GNUC_MINOR__ * 100 >=40700
static const uint8x16_t r8 = {1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12};
return vreinterpretq_u32_u8(__builtin_shuffle(vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x), r8));
#else
return vsriq_n_u32(vshlq_n_u32(x, 32-8), x, 8);
#endif
}
INLINE uint32x4_t rot7_128(uint32x4_t x) {
// See comment in rot16_128.
// return vorrq_u32(vshrq_n_u32(x, 7), vshlq_n_u32(x, 32 - 7));
return vsriq_n_u32(vshlq_n_u32(x, 32-7), x, 7);
}
// TODO: compress_neon
// TODO: hash2_neon
/*
* ----------------------------------------------------------------------------
* hash4_neon
* ----------------------------------------------------------------------------
*/
INLINE void round_fn4(uint32x4_t v[16], uint32x4_t m[16], size_t r) {
v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
v[0] = add_128(v[0], v[4]);
v[1] = add_128(v[1], v[5]);
v[2] = add_128(v[2], v[6]);
v[3] = add_128(v[3], v[7]);
v[12] = xor_128(v[12], v[0]);
v[13] = xor_128(v[13], v[1]);
v[14] = xor_128(v[14], v[2]);
v[15] = xor_128(v[15], v[3]);
v[12] = rot16_128(v[12]);
v[13] = rot16_128(v[13]);
v[14] = rot16_128(v[14]);
v[15] = rot16_128(v[15]);
v[8] = add_128(v[8], v[12]);
v[9] = add_128(v[9], v[13]);
v[10] = add_128(v[10], v[14]);
v[11] = add_128(v[11], v[15]);
v[4] = xor_128(v[4], v[8]);
v[5] = xor_128(v[5], v[9]);
v[6] = xor_128(v[6], v[10]);
v[7] = xor_128(v[7], v[11]);
v[4] = rot12_128(v[4]);
v[5] = rot12_128(v[5]);
v[6] = rot12_128(v[6]);
v[7] = rot12_128(v[7]);
v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
v[0] = add_128(v[0], v[4]);
v[1] = add_128(v[1], v[5]);
v[2] = add_128(v[2], v[6]);
v[3] = add_128(v[3], v[7]);
v[12] = xor_128(v[12], v[0]);
v[13] = xor_128(v[13], v[1]);
v[14] = xor_128(v[14], v[2]);
v[15] = xor_128(v[15], v[3]);
v[12] = rot8_128(v[12]);
v[13] = rot8_128(v[13]);
v[14] = rot8_128(v[14]);
v[15] = rot8_128(v[15]);
v[8] = add_128(v[8], v[12]);
v[9] = add_128(v[9], v[13]);
v[10] = add_128(v[10], v[14]);
v[11] = add_128(v[11], v[15]);
v[4] = xor_128(v[4], v[8]);
v[5] = xor_128(v[5], v[9]);
v[6] = xor_128(v[6], v[10]);
v[7] = xor_128(v[7], v[11]);
v[4] = rot7_128(v[4]);
v[5] = rot7_128(v[5]);
v[6] = rot7_128(v[6]);
v[7] = rot7_128(v[7]);
v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
v[0] = add_128(v[0], v[5]);
v[1] = add_128(v[1], v[6]);
v[2] = add_128(v[2], v[7]);
v[3] = add_128(v[3], v[4]);
v[15] = xor_128(v[15], v[0]);
v[12] = xor_128(v[12], v[1]);
v[13] = xor_128(v[13], v[2]);
v[14] = xor_128(v[14], v[3]);
v[15] = rot16_128(v[15]);
v[12] = rot16_128(v[12]);
v[13] = rot16_128(v[13]);
v[14] = rot16_128(v[14]);
v[10] = add_128(v[10], v[15]);
v[11] = add_128(v[11], v[12]);
v[8] = add_128(v[8], v[13]);
v[9] = add_128(v[9], v[14]);
v[5] = xor_128(v[5], v[10]);
v[6] = xor_128(v[6], v[11]);
v[7] = xor_128(v[7], v[8]);
v[4] = xor_128(v[4], v[9]);
v[5] = rot12_128(v[5]);
v[6] = rot12_128(v[6]);
v[7] = rot12_128(v[7]);
v[4] = rot12_128(v[4]);
v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
v[0] = add_128(v[0], v[5]);
v[1] = add_128(v[1], v[6]);
v[2] = add_128(v[2], v[7]);
v[3] = add_128(v[3], v[4]);
v[15] = xor_128(v[15], v[0]);
v[12] = xor_128(v[12], v[1]);
v[13] = xor_128(v[13], v[2]);
v[14] = xor_128(v[14], v[3]);
v[15] = rot8_128(v[15]);
v[12] = rot8_128(v[12]);
v[13] = rot8_128(v[13]);
v[14] = rot8_128(v[14]);
v[10] = add_128(v[10], v[15]);
v[11] = add_128(v[11], v[12]);
v[8] = add_128(v[8], v[13]);
v[9] = add_128(v[9], v[14]);
v[5] = xor_128(v[5], v[10]);
v[6] = xor_128(v[6], v[11]);
v[7] = xor_128(v[7], v[8]);
v[4] = xor_128(v[4], v[9]);
v[5] = rot7_128(v[5]);
v[6] = rot7_128(v[6]);
v[7] = rot7_128(v[7]);
v[4] = rot7_128(v[4]);
}
INLINE void transpose_vecs_128(uint32x4_t vecs[4]) {
// Individually transpose the four 2x2 sub-matrices in each corner.
uint32x4x2_t rows01 = vtrnq_u32(vecs[0], vecs[1]);
uint32x4x2_t rows23 = vtrnq_u32(vecs[2], vecs[3]);
// Swap the top-right and bottom-left 2x2s (which just got transposed).
vecs[0] =
vcombine_u32(vget_low_u32(rows01.val[0]), vget_low_u32(rows23.val[0]));
vecs[1] =
vcombine_u32(vget_low_u32(rows01.val[1]), vget_low_u32(rows23.val[1]));
vecs[2] =
vcombine_u32(vget_high_u32(rows01.val[0]), vget_high_u32(rows23.val[0]));
vecs[3] =
vcombine_u32(vget_high_u32(rows01.val[1]), vget_high_u32(rows23.val[1]));
}
INLINE void transpose_msg_vecs4(const uint8_t *const *inputs,
size_t block_offset, uint32x4_t out[16]) {
out[0] = loadu_128(&inputs[0][block_offset + 0 * sizeof(uint32x4_t)]);
out[1] = loadu_128(&inputs[1][block_offset + 0 * sizeof(uint32x4_t)]);
out[2] = loadu_128(&inputs[2][block_offset + 0 * sizeof(uint32x4_t)]);
out[3] = loadu_128(&inputs[3][block_offset + 0 * sizeof(uint32x4_t)]);
out[4] = loadu_128(&inputs[0][block_offset + 1 * sizeof(uint32x4_t)]);
out[5] = loadu_128(&inputs[1][block_offset + 1 * sizeof(uint32x4_t)]);
out[6] = loadu_128(&inputs[2][block_offset + 1 * sizeof(uint32x4_t)]);
out[7] = loadu_128(&inputs[3][block_offset + 1 * sizeof(uint32x4_t)]);
out[8] = loadu_128(&inputs[0][block_offset + 2 * sizeof(uint32x4_t)]);
out[9] = loadu_128(&inputs[1][block_offset + 2 * sizeof(uint32x4_t)]);
out[10] = loadu_128(&inputs[2][block_offset + 2 * sizeof(uint32x4_t)]);
out[11] = loadu_128(&inputs[3][block_offset + 2 * sizeof(uint32x4_t)]);
out[12] = loadu_128(&inputs[0][block_offset + 3 * sizeof(uint32x4_t)]);
out[13] = loadu_128(&inputs[1][block_offset + 3 * sizeof(uint32x4_t)]);
out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(uint32x4_t)]);
out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(uint32x4_t)]);
transpose_vecs_128(&out[0]);
transpose_vecs_128(&out[4]);
transpose_vecs_128(&out[8]);
transpose_vecs_128(&out[12]);
}
INLINE void load_counters4(uint64_t counter, bool increment_counter,
uint32x4_t *out_low, uint32x4_t *out_high) {
uint64_t mask = (increment_counter ? ~0 : 0);
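// mask is all-ones when incrementing, so the four lanes get counter + 0..3;
// otherwise every lane gets the same counter value.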
*out_low = set4(
counter_low(counter + (mask & 0)), counter_low(counter + (mask & 1)),
counter_low(counter + (mask & 2)), counter_low(counter + (mask & 3)));
*out_high = set4(
counter_high(counter + (mask & 0)), counter_high(counter + (mask & 1)),
counter_high(counter + (mask & 2)), counter_high(counter + (mask & 3)));
}
void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks,
const uint32_t key[8], uint64_t counter,
bool increment_counter, uint8_t flags,
uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
uint32x4_t h_vecs[8] = {
set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]),
set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]),
};
uint32x4_t counter_low_vec, counter_high_vec;
load_counters4(counter, increment_counter, &counter_low_vec,
&counter_high_vec);
uint8_t block_flags = flags | flags_start;
for (size_t block = 0; block < blocks; block++) {
if (block + 1 == blocks) {
block_flags |= flags_end;
}
uint32x4_t block_len_vec = set1_128(BLAKE3_BLOCK_LEN);
uint32x4_t block_flags_vec = set1_128(block_flags);
uint32x4_t msg_vecs[16];
transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
uint32x4_t v[16] = {
h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]),
counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
};
round_fn4(v, msg_vecs, 0);
round_fn4(v, msg_vecs, 1);
round_fn4(v, msg_vecs, 2);
round_fn4(v, msg_vecs, 3);
round_fn4(v, msg_vecs, 4);
round_fn4(v, msg_vecs, 5);
round_fn4(v, msg_vecs, 6);
h_vecs[0] = xor_128(v[0], v[8]);
h_vecs[1] = xor_128(v[1], v[9]);
h_vecs[2] = xor_128(v[2], v[10]);
h_vecs[3] = xor_128(v[3], v[11]);
h_vecs[4] = xor_128(v[4], v[12]);
h_vecs[5] = xor_128(v[5], v[13]);
h_vecs[6] = xor_128(v[6], v[14]);
h_vecs[7] = xor_128(v[7], v[15]);
block_flags = flags;
}
transpose_vecs_128(&h_vecs[0]);
transpose_vecs_128(&h_vecs[4]);
// The first four vecs now contain the first half of each output, and the
// second four vecs contain the second half of each output.
storeu_128(h_vecs[0], &out[0 * sizeof(uint32x4_t)]);
storeu_128(h_vecs[4], &out[1 * sizeof(uint32x4_t)]);
storeu_128(h_vecs[1], &out[2 * sizeof(uint32x4_t)]);
storeu_128(h_vecs[5], &out[3 * sizeof(uint32x4_t)]);
storeu_128(h_vecs[2], &out[4 * sizeof(uint32x4_t)]);
storeu_128(h_vecs[6], &out[5 * sizeof(uint32x4_t)]);
storeu_128(h_vecs[3], &out[6 * sizeof(uint32x4_t)]);
storeu_128(h_vecs[7], &out[7 * sizeof(uint32x4_t)]);
}
/*
* ----------------------------------------------------------------------------
* hash_many_neon
* ----------------------------------------------------------------------------
*/
void blake3_compress_in_place_portable(uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags);
INLINE void hash_one_neon(const uint8_t *input, size_t blocks,
const uint32_t key[8], uint64_t counter,
uint8_t flags, uint8_t flags_start, uint8_t flags_end,
uint8_t out[BLAKE3_OUT_LEN]) {
uint32_t cv[8];
memcpy(cv, key, BLAKE3_KEY_LEN);
uint8_t block_flags = flags | flags_start;
while (blocks > 0) {
if (blocks == 1) {
block_flags |= flags_end;
}
// TODO: Implement compress_neon. However note that according to
// https://github.com/BLAKE2/BLAKE2/commit/7965d3e6e1b4193438b8d3a656787587d2579227,
// compress_neon might not be any faster than compress_portable.
blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter,
block_flags);
input = &input[BLAKE3_BLOCK_LEN];
blocks -= 1;
block_flags = flags;
}
memcpy(out, cv, BLAKE3_OUT_LEN);
}
void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
uint64_t counter, bool increment_counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out) {
while (num_inputs >= 4) {
blake3_hash4_neon(inputs, blocks, key, counter, increment_counter, flags,
flags_start, flags_end, out);
if (increment_counter) {
counter += 4;
}
inputs += 4;
num_inputs -= 4;
out = &out[4 * BLAKE3_OUT_LEN];
}
while (num_inputs > 0) {
hash_one_neon(inputs[0], blocks, key, counter, flags, flags_start,
flags_end, out);
if (increment_counter) {
counter += 1;
}
inputs += 1;
num_inputs -= 1;
out = &out[BLAKE3_OUT_LEN];
}
}
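As a sanity check on the 4-wide NEON path, the wide and one-at-a-time routes must agree. A minimal consistency sketch, not part of the vendored file, assuming a NEON-enabled build and the CHUNK_START/CHUNK_END flag constants from blake3_impl.h:
#include <assert.h>
#include <string.h>
#include "blake3_impl.h"
// Hash four one-block chunks both ways and compare: the 4-wide SIMD path must
// produce exactly the same chaining values as four one-input calls.
static void check_hash_many_neon(void) {
  uint8_t chunks[4][BLAKE3_BLOCK_LEN];
  const uint8_t *inputs[4];
  for (size_t i = 0; i < 4; i++) {
    memset(chunks[i], (int)i + 1, BLAKE3_BLOCK_LEN);
    inputs[i] = chunks[i];
  }
  uint8_t wide[4 * BLAKE3_OUT_LEN];
  uint8_t narrow[4 * BLAKE3_OUT_LEN];
  // increment_counter = true: input i is hashed with counter value i.
  blake3_hash_many_neon(inputs, 4, 1, IV, 0, true, 0, CHUNK_START, CHUNK_END,
                        wide);
  for (size_t i = 0; i < 4; i++) {
    blake3_hash_many_neon(&inputs[i], 1, 1, IV, (uint64_t)i, false, 0,
                          CHUNK_START, CHUNK_END, &narrow[i * BLAKE3_OUT_LEN]);
  }
  assert(memcmp(wide, narrow, sizeof(wide)) == 0);
}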

160
external/blake3/blake3_portable.c vendored Normal file

@@ -0,0 +1,160 @@
#include "blake3_impl.h"
#include <string.h>
INLINE uint32_t rotr32(uint32_t w, uint32_t c) {
return (w >> c) | (w << (32 - c));
}
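// The G function: mixes two message words (x, y) into one column or diagonal
// of the state, using the BLAKE2s rotation distances 16, 12, 8, 7.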
INLINE void g(uint32_t *state, size_t a, size_t b, size_t c, size_t d,
uint32_t x, uint32_t y) {
state[a] = state[a] + state[b] + x;
state[d] = rotr32(state[d] ^ state[a], 16);
state[c] = state[c] + state[d];
state[b] = rotr32(state[b] ^ state[c], 12);
state[a] = state[a] + state[b] + y;
state[d] = rotr32(state[d] ^ state[a], 8);
state[c] = state[c] + state[d];
state[b] = rotr32(state[b] ^ state[c], 7);
}
INLINE void round_fn(uint32_t state[16], const uint32_t *msg, size_t round) {
// Select the message schedule based on the round.
const uint8_t *schedule = MSG_SCHEDULE[round];
// Mix the columns.
g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]);
g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]);
g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]);
g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]);
// Mix the rows.
g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]);
g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]);
g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]);
g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]);
}
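// compress_pre assembles the 16-word state: 8 chaining-value words, the first
// 4 IV words, the 64-bit counter split into low/high halves, the block
// length, and the flags.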
INLINE void compress_pre(uint32_t state[16], const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter, uint8_t flags) {
uint32_t block_words[16];
block_words[0] = load32(block + 4 * 0);
block_words[1] = load32(block + 4 * 1);
block_words[2] = load32(block + 4 * 2);
block_words[3] = load32(block + 4 * 3);
block_words[4] = load32(block + 4 * 4);
block_words[5] = load32(block + 4 * 5);
block_words[6] = load32(block + 4 * 6);
block_words[7] = load32(block + 4 * 7);
block_words[8] = load32(block + 4 * 8);
block_words[9] = load32(block + 4 * 9);
block_words[10] = load32(block + 4 * 10);
block_words[11] = load32(block + 4 * 11);
block_words[12] = load32(block + 4 * 12);
block_words[13] = load32(block + 4 * 13);
block_words[14] = load32(block + 4 * 14);
block_words[15] = load32(block + 4 * 15);
state[0] = cv[0];
state[1] = cv[1];
state[2] = cv[2];
state[3] = cv[3];
state[4] = cv[4];
state[5] = cv[5];
state[6] = cv[6];
state[7] = cv[7];
state[8] = IV[0];
state[9] = IV[1];
state[10] = IV[2];
state[11] = IV[3];
state[12] = counter_low(counter);
state[13] = counter_high(counter);
state[14] = (uint32_t)block_len;
state[15] = (uint32_t)flags;
round_fn(state, &block_words[0], 0);
round_fn(state, &block_words[0], 1);
round_fn(state, &block_words[0], 2);
round_fn(state, &block_words[0], 3);
round_fn(state, &block_words[0], 4);
round_fn(state, &block_words[0], 5);
round_fn(state, &block_words[0], 6);
}
void blake3_compress_in_place_portable(uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags) {
uint32_t state[16];
compress_pre(state, cv, block, block_len, counter, flags);
cv[0] = state[0] ^ state[8];
cv[1] = state[1] ^ state[9];
cv[2] = state[2] ^ state[10];
cv[3] = state[3] ^ state[11];
cv[4] = state[4] ^ state[12];
cv[5] = state[5] ^ state[13];
cv[6] = state[6] ^ state[14];
cv[7] = state[7] ^ state[15];
}
void blake3_compress_xof_portable(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags, uint8_t out[64]) {
uint32_t state[16];
compress_pre(state, cv, block, block_len, counter, flags);
store32(&out[0 * 4], state[0] ^ state[8]);
store32(&out[1 * 4], state[1] ^ state[9]);
store32(&out[2 * 4], state[2] ^ state[10]);
store32(&out[3 * 4], state[3] ^ state[11]);
store32(&out[4 * 4], state[4] ^ state[12]);
store32(&out[5 * 4], state[5] ^ state[13]);
store32(&out[6 * 4], state[6] ^ state[14]);
store32(&out[7 * 4], state[7] ^ state[15]);
store32(&out[8 * 4], state[8] ^ cv[0]);
store32(&out[9 * 4], state[9] ^ cv[1]);
store32(&out[10 * 4], state[10] ^ cv[2]);
store32(&out[11 * 4], state[11] ^ cv[3]);
store32(&out[12 * 4], state[12] ^ cv[4]);
store32(&out[13 * 4], state[13] ^ cv[5]);
store32(&out[14 * 4], state[14] ^ cv[6]);
store32(&out[15 * 4], state[15] ^ cv[7]);
}
INLINE void hash_one_portable(const uint8_t *input, size_t blocks,
const uint32_t key[8], uint64_t counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) {
uint32_t cv[8];
memcpy(cv, key, BLAKE3_KEY_LEN);
uint8_t block_flags = flags | flags_start;
while (blocks > 0) {
if (blocks == 1) {
block_flags |= flags_end;
}
blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter,
block_flags);
input = &input[BLAKE3_BLOCK_LEN];
blocks -= 1;
block_flags = flags;
}
store_cv_words(out, cv);
}
void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
uint64_t counter, bool increment_counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out) {
while (num_inputs > 0) {
hash_one_portable(inputs[0], blocks, key, counter, flags, flags_start,
flags_end, out);
if (increment_counter) {
counter += 1;
}
inputs += 1;
num_inputs -= 1;
out = &out[BLAKE3_OUT_LEN];
}
}
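A known-answer sketch for the portable compressor, not part of the vendored file: hashing the empty input is exactly one compression of an all-zero block with block_len 0 over the IV, using the CHUNK_START, CHUNK_END and ROOT flag constants (assumed here from blake3_impl.h), and the 32 output bytes are the well-known empty-input digest.
#include <stdio.h>
#include <string.h>
#include "blake3_impl.h"
int main(void) {
  uint32_t cv[8];
  memcpy(cv, IV, sizeof(cv));             // empty input: cv starts at the IV
  uint8_t block[BLAKE3_BLOCK_LEN] = {0};  // all-zero block, block_len 0
  blake3_compress_in_place_portable(cv, block, 0, 0,
                                    CHUNK_START | CHUNK_END | ROOT);
  uint8_t out[32];
  store_cv_words(out, cv);                // little-endian word serialization
  for (size_t i = 0; i < 32; i++) {
    printf("%02x", out[i]);
  }
  // Expected:
  // af1349b9f5f9a1a6a0404dee36dcc9499bcb25c9adc112b7cc9a93cae41f3262
  printf("\n");
  return 0;
}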

566
external/blake3/blake3_sse2.c vendored Normal file

@@ -0,0 +1,566 @@
#include "blake3_impl.h"
#include <immintrin.h>
#define DEGREE 4
#define _mm_shuffle_ps2(a, b, c) \
(_mm_castps_si128( \
_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c))))
INLINE __m128i loadu(const uint8_t src[16]) {
return _mm_loadu_si128((const __m128i *)src);
}
INLINE void storeu(__m128i src, uint8_t dest[16]) {
_mm_storeu_si128((__m128i *)dest, src);
}
INLINE __m128i addv(__m128i a, __m128i b) { return _mm_add_epi32(a, b); }
// Note that clang-format doesn't like the name "xor" for some reason.
INLINE __m128i xorv(__m128i a, __m128i b) { return _mm_xor_si128(a, b); }
INLINE __m128i set1(uint32_t x) { return _mm_set1_epi32((int32_t)x); }
INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d);
}
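// SSE2 has no byte shuffle, so rotating each 32-bit lane by 16 is done by
// swapping its 16-bit halves: 0xB1 encodes the (1, 0, 3, 2) word order.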
INLINE __m128i rot16(__m128i x) {
return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, 0xB1), 0xB1);
}
INLINE __m128i rot12(__m128i x) {
return xorv(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12));
}
INLINE __m128i rot8(__m128i x) {
return xorv(_mm_srli_epi32(x, 8), _mm_slli_epi32(x, 32 - 8));
}
INLINE __m128i rot7(__m128i x) {
return xorv(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7));
}
INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
__m128i m) {
*row0 = addv(addv(*row0, m), *row1);
*row3 = xorv(*row3, *row0);
*row3 = rot16(*row3);
*row2 = addv(*row2, *row3);
*row1 = xorv(*row1, *row2);
*row1 = rot12(*row1);
}
INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
__m128i m) {
*row0 = addv(addv(*row0, m), *row1);
*row3 = xorv(*row3, *row0);
*row3 = rot8(*row3);
*row2 = addv(*row2, *row3);
*row1 = xorv(*row1, *row2);
*row1 = rot7(*row1);
}
// Note the optimization here of leaving row1 as the unrotated row, rather than
// row0. All the message loads below are adjusted to compensate for this. See
// discussion at https://github.com/sneves/blake2-avx2/pull/4
INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
*row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3));
*row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
*row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1));
}
INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
*row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1));
*row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
*row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3));
}
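// SSE2 lacks _mm_blend_epi16 (an SSE4.1 instruction), so it is emulated here:
// expand each bit of imm8 into a full 16-bit mask, then select lanes of b
// where the mask is set and lanes of a elsewhere.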
INLINE __m128i blend_epi16(__m128i a, __m128i b, const int16_t imm8) {
const __m128i bits = _mm_set_epi16(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
__m128i mask = _mm_set1_epi16(imm8);
mask = _mm_and_si128(mask, bits);
mask = _mm_cmpeq_epi16(mask, bits);
return _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a));
}
INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter, uint8_t flags) {
rows[0] = loadu((uint8_t *)&cv[0]);
rows[1] = loadu((uint8_t *)&cv[4]);
rows[2] = set4(IV[0], IV[1], IV[2], IV[3]);
rows[3] = set4(counter_low(counter), counter_high(counter),
(uint32_t)block_len, (uint32_t)flags);
__m128i m0 = loadu(&block[sizeof(__m128i) * 0]);
__m128i m1 = loadu(&block[sizeof(__m128i) * 1]);
__m128i m2 = loadu(&block[sizeof(__m128i) * 2]);
__m128i m3 = loadu(&block[sizeof(__m128i) * 3]);
__m128i t0, t1, t2, t3, tt;
// Round 1. The first round permutes the message words from the original
// input order, into the groups that get mixed in parallel.
t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0
g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1
g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
diagonalize(&rows[0], &rows[2], &rows[3]);
t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8
t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14
g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9
t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15
g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
undiagonalize(&rows[0], &rows[2], &rows[3]);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 2. This round and all following rounds apply a fixed permutation
// to the message words from the round before.
t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
t1 = blend_epi16(tt, t1, 0xCC);
g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
diagonalize(&rows[0], &rows[2], &rows[3]);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
undiagonalize(&rows[0], &rows[2], &rows[3]);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 3
t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
t1 = blend_epi16(tt, t1, 0xCC);
g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
diagonalize(&rows[0], &rows[2], &rows[3]);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
undiagonalize(&rows[0], &rows[2], &rows[3]);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 4
t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
t1 = blend_epi16(tt, t1, 0xCC);
g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
diagonalize(&rows[0], &rows[2], &rows[3]);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
undiagonalize(&rows[0], &rows[2], &rows[3]);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 5
t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
t1 = blend_epi16(tt, t1, 0xCC);
g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
diagonalize(&rows[0], &rows[2], &rows[3]);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
undiagonalize(&rows[0], &rows[2], &rows[3]);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 6
t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
t1 = blend_epi16(tt, t1, 0xCC);
g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
diagonalize(&rows[0], &rows[2], &rows[3]);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
undiagonalize(&rows[0], &rows[2], &rows[3]);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 7
t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
t1 = blend_epi16(tt, t1, 0xCC);
g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
diagonalize(&rows[0], &rows[2], &rows[3]);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
undiagonalize(&rows[0], &rows[2], &rows[3]);
}
void blake3_compress_in_place_sse2(uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags) {
__m128i rows[4];
compress_pre(rows, cv, block, block_len, counter, flags);
storeu(xorv(rows[0], rows[2]), (uint8_t *)&cv[0]);
storeu(xorv(rows[1], rows[3]), (uint8_t *)&cv[4]);
}
void blake3_compress_xof_sse2(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags, uint8_t out[64]) {
__m128i rows[4];
compress_pre(rows, cv, block, block_len, counter, flags);
storeu(xorv(rows[0], rows[2]), &out[0]);
storeu(xorv(rows[1], rows[3]), &out[16]);
storeu(xorv(rows[2], loadu((uint8_t *)&cv[0])), &out[32]);
storeu(xorv(rows[3], loadu((uint8_t *)&cv[4])), &out[48]);
}
INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) {
v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
v[0] = addv(v[0], v[4]);
v[1] = addv(v[1], v[5]);
v[2] = addv(v[2], v[6]);
v[3] = addv(v[3], v[7]);
v[12] = xorv(v[12], v[0]);
v[13] = xorv(v[13], v[1]);
v[14] = xorv(v[14], v[2]);
v[15] = xorv(v[15], v[3]);
v[12] = rot16(v[12]);
v[13] = rot16(v[13]);
v[14] = rot16(v[14]);
v[15] = rot16(v[15]);
v[8] = addv(v[8], v[12]);
v[9] = addv(v[9], v[13]);
v[10] = addv(v[10], v[14]);
v[11] = addv(v[11], v[15]);
v[4] = xorv(v[4], v[8]);
v[5] = xorv(v[5], v[9]);
v[6] = xorv(v[6], v[10]);
v[7] = xorv(v[7], v[11]);
v[4] = rot12(v[4]);
v[5] = rot12(v[5]);
v[6] = rot12(v[6]);
v[7] = rot12(v[7]);
v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
v[0] = addv(v[0], v[4]);
v[1] = addv(v[1], v[5]);
v[2] = addv(v[2], v[6]);
v[3] = addv(v[3], v[7]);
v[12] = xorv(v[12], v[0]);
v[13] = xorv(v[13], v[1]);
v[14] = xorv(v[14], v[2]);
v[15] = xorv(v[15], v[3]);
v[12] = rot8(v[12]);
v[13] = rot8(v[13]);
v[14] = rot8(v[14]);
v[15] = rot8(v[15]);
v[8] = addv(v[8], v[12]);
v[9] = addv(v[9], v[13]);
v[10] = addv(v[10], v[14]);
v[11] = addv(v[11], v[15]);
v[4] = xorv(v[4], v[8]);
v[5] = xorv(v[5], v[9]);
v[6] = xorv(v[6], v[10]);
v[7] = xorv(v[7], v[11]);
v[4] = rot7(v[4]);
v[5] = rot7(v[5]);
v[6] = rot7(v[6]);
v[7] = rot7(v[7]);
v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
v[0] = addv(v[0], v[5]);
v[1] = addv(v[1], v[6]);
v[2] = addv(v[2], v[7]);
v[3] = addv(v[3], v[4]);
v[15] = xorv(v[15], v[0]);
v[12] = xorv(v[12], v[1]);
v[13] = xorv(v[13], v[2]);
v[14] = xorv(v[14], v[3]);
v[15] = rot16(v[15]);
v[12] = rot16(v[12]);
v[13] = rot16(v[13]);
v[14] = rot16(v[14]);
v[10] = addv(v[10], v[15]);
v[11] = addv(v[11], v[12]);
v[8] = addv(v[8], v[13]);
v[9] = addv(v[9], v[14]);
v[5] = xorv(v[5], v[10]);
v[6] = xorv(v[6], v[11]);
v[7] = xorv(v[7], v[8]);
v[4] = xorv(v[4], v[9]);
v[5] = rot12(v[5]);
v[6] = rot12(v[6]);
v[7] = rot12(v[7]);
v[4] = rot12(v[4]);
v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
v[0] = addv(v[0], v[5]);
v[1] = addv(v[1], v[6]);
v[2] = addv(v[2], v[7]);
v[3] = addv(v[3], v[4]);
v[15] = xorv(v[15], v[0]);
v[12] = xorv(v[12], v[1]);
v[13] = xorv(v[13], v[2]);
v[14] = xorv(v[14], v[3]);
v[15] = rot8(v[15]);
v[12] = rot8(v[12]);
v[13] = rot8(v[13]);
v[14] = rot8(v[14]);
v[10] = addv(v[10], v[15]);
v[11] = addv(v[11], v[12]);
v[8] = addv(v[8], v[13]);
v[9] = addv(v[9], v[14]);
v[5] = xorv(v[5], v[10]);
v[6] = xorv(v[6], v[11]);
v[7] = xorv(v[7], v[8]);
v[4] = xorv(v[4], v[9]);
v[5] = rot7(v[5]);
v[6] = rot7(v[6]);
v[7] = rot7(v[7]);
v[4] = rot7(v[4]);
}
INLINE void transpose_vecs(__m128i vecs[DEGREE]) {
// Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
// 22/33. Note that this doesn't split the vector into two lanes, as the
// AVX2 counterparts do.
__m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
__m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
__m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
__m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
// Interleave 64-bit lanes.
__m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01);
__m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01);
__m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23);
__m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23);
vecs[0] = abcd_0;
vecs[1] = abcd_1;
vecs[2] = abcd_2;
vecs[3] = abcd_3;
}
INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
size_t block_offset, __m128i out[16]) {
out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m128i)]);
out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m128i)]);
out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m128i)]);
out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m128i)]);
out[4] = loadu(&inputs[0][block_offset + 1 * sizeof(__m128i)]);
out[5] = loadu(&inputs[1][block_offset + 1 * sizeof(__m128i)]);
out[6] = loadu(&inputs[2][block_offset + 1 * sizeof(__m128i)]);
out[7] = loadu(&inputs[3][block_offset + 1 * sizeof(__m128i)]);
out[8] = loadu(&inputs[0][block_offset + 2 * sizeof(__m128i)]);
out[9] = loadu(&inputs[1][block_offset + 2 * sizeof(__m128i)]);
out[10] = loadu(&inputs[2][block_offset + 2 * sizeof(__m128i)]);
out[11] = loadu(&inputs[3][block_offset + 2 * sizeof(__m128i)]);
out[12] = loadu(&inputs[0][block_offset + 3 * sizeof(__m128i)]);
out[13] = loadu(&inputs[1][block_offset + 3 * sizeof(__m128i)]);
out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
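// Prefetch 256 bytes ahead in each input stream so upcoming blocks are in
// cache before the round loop needs them.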
for (size_t i = 0; i < 4; ++i) {
_mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
}
transpose_vecs(&out[0]);
transpose_vecs(&out[4]);
transpose_vecs(&out[8]);
transpose_vecs(&out[12]);
}
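// Build four consecutive 64-bit counters as split 32-bit vectors. SSE2 only
// has signed compares, so the unsigned carry out of the low word is detected
// by flipping the sign bit of both sides before _mm_cmpgt_epi32; lanes that
// carried compare as -1 and are then subtracted from the high word.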
INLINE void load_counters(uint64_t counter, bool increment_counter,
__m128i *out_lo, __m128i *out_hi) {
const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter);
const __m128i add0 = _mm_set_epi32(3, 2, 1, 0);
const __m128i add1 = _mm_and_si128(mask, add0);
__m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1);
__m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)),
_mm_xor_si128( l, _mm_set1_epi32(0x80000000)));
__m128i h = _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry);
*out_lo = l;
*out_hi = h;
}
static
void blake3_hash4_sse2(const uint8_t *const *inputs, size_t blocks,
const uint32_t key[8], uint64_t counter,
bool increment_counter, uint8_t flags,
uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
__m128i h_vecs[8] = {
set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]),
set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]),
};
__m128i counter_low_vec, counter_high_vec;
load_counters(counter, increment_counter, &counter_low_vec,
&counter_high_vec);
uint8_t block_flags = flags | flags_start;
for (size_t block = 0; block < blocks; block++) {
if (block + 1 == blocks) {
block_flags |= flags_end;
}
__m128i block_len_vec = set1(BLAKE3_BLOCK_LEN);
__m128i block_flags_vec = set1(block_flags);
__m128i msg_vecs[16];
transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
__m128i v[16] = {
h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]),
counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
};
round_fn(v, msg_vecs, 0);
round_fn(v, msg_vecs, 1);
round_fn(v, msg_vecs, 2);
round_fn(v, msg_vecs, 3);
round_fn(v, msg_vecs, 4);
round_fn(v, msg_vecs, 5);
round_fn(v, msg_vecs, 6);
h_vecs[0] = xorv(v[0], v[8]);
h_vecs[1] = xorv(v[1], v[9]);
h_vecs[2] = xorv(v[2], v[10]);
h_vecs[3] = xorv(v[3], v[11]);
h_vecs[4] = xorv(v[4], v[12]);
h_vecs[5] = xorv(v[5], v[13]);
h_vecs[6] = xorv(v[6], v[14]);
h_vecs[7] = xorv(v[7], v[15]);
block_flags = flags;
}
transpose_vecs(&h_vecs[0]);
transpose_vecs(&h_vecs[4]);
// The first four vecs now contain the first half of each output, and the
// second four vecs contain the second half of each output.
storeu(h_vecs[0], &out[0 * sizeof(__m128i)]);
storeu(h_vecs[4], &out[1 * sizeof(__m128i)]);
storeu(h_vecs[1], &out[2 * sizeof(__m128i)]);
storeu(h_vecs[5], &out[3 * sizeof(__m128i)]);
storeu(h_vecs[2], &out[4 * sizeof(__m128i)]);
storeu(h_vecs[6], &out[5 * sizeof(__m128i)]);
storeu(h_vecs[3], &out[6 * sizeof(__m128i)]);
storeu(h_vecs[7], &out[7 * sizeof(__m128i)]);
}
INLINE void hash_one_sse2(const uint8_t *input, size_t blocks,
const uint32_t key[8], uint64_t counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) {
uint32_t cv[8];
memcpy(cv, key, BLAKE3_KEY_LEN);
uint8_t block_flags = flags | flags_start;
while (blocks > 0) {
if (blocks == 1) {
block_flags |= flags_end;
}
blake3_compress_in_place_sse2(cv, input, BLAKE3_BLOCK_LEN, counter,
block_flags);
input = &input[BLAKE3_BLOCK_LEN];
blocks -= 1;
block_flags = flags;
}
memcpy(out, cv, BLAKE3_OUT_LEN);
}
void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
uint64_t counter, bool increment_counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out) {
while (num_inputs >= DEGREE) {
blake3_hash4_sse2(inputs, blocks, key, counter, increment_counter, flags,
flags_start, flags_end, out);
if (increment_counter) {
counter += DEGREE;
}
inputs += DEGREE;
num_inputs -= DEGREE;
out = &out[DEGREE * BLAKE3_OUT_LEN];
}
while (num_inputs > 0) {
hash_one_sse2(inputs[0], blocks, key, counter, flags, flags_start,
flags_end, out);
if (increment_counter) {
counter += 1;
}
inputs += 1;
num_inputs -= 1;
out = &out[BLAKE3_OUT_LEN];
}
}
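All backends export the same three entry points (compress_in_place, compress_xof, hash_many), which is what lets blake3_dispatch.c choose an implementation at run time. A minimal sketch of that idea; the cpu_supports_*() probes are assumed helpers, and the real dispatcher consults a cached feature mask instead:
// Illustrative only; not the vendored dispatcher.
static void compress_in_place_dispatch(uint32_t cv[8],
                                       const uint8_t block[BLAKE3_BLOCK_LEN],
                                       uint8_t block_len, uint64_t counter,
                                       uint8_t flags) {
#if defined(__x86_64__) || defined(_M_X64)
  if (cpu_supports_sse41()) {  // assumed probe
    blake3_compress_in_place_sse41(cv, block, block_len, counter, flags);
    return;
  }
  if (cpu_supports_sse2()) {   // assumed probe
    blake3_compress_in_place_sse2(cv, block, block_len, counter, flags);
    return;
  }
#endif
  blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
}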

2291
external/blake3/blake3_sse2_x86-64_unix.S vendored Normal file

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

560
external/blake3/blake3_sse41.c vendored Normal file

@@ -0,0 +1,560 @@
#include "blake3_impl.h"
#include <immintrin.h>
#define DEGREE 4
#define _mm_shuffle_ps2(a, b, c) \
(_mm_castps_si128( \
_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c))))
INLINE __m128i loadu(const uint8_t src[16]) {
return _mm_loadu_si128((const __m128i *)src);
}
INLINE void storeu(__m128i src, uint8_t dest[16]) {
_mm_storeu_si128((__m128i *)dest, src);
}
INLINE __m128i addv(__m128i a, __m128i b) { return _mm_add_epi32(a, b); }
// Note that clang-format doesn't like the name "xor" for some reason.
INLINE __m128i xorv(__m128i a, __m128i b) { return _mm_xor_si128(a, b); }
INLINE __m128i set1(uint32_t x) { return _mm_set1_epi32((int32_t)x); }
INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d);
}
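// Unlike the SSE2 file, rot16/rot8 here are a single byte shuffle each
// (_mm_shuffle_epi8 is SSSE3, which SSE4.1 implies).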
INLINE __m128i rot16(__m128i x) {
return _mm_shuffle_epi8(
x, _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2));
}
INLINE __m128i rot12(__m128i x) {
return xorv(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12));
}
INLINE __m128i rot8(__m128i x) {
return _mm_shuffle_epi8(
x, _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1));
}
INLINE __m128i rot7(__m128i x) {
return xorv(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7));
}
INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
__m128i m) {
*row0 = addv(addv(*row0, m), *row1);
*row3 = xorv(*row3, *row0);
*row3 = rot16(*row3);
*row2 = addv(*row2, *row3);
*row1 = xorv(*row1, *row2);
*row1 = rot12(*row1);
}
INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
__m128i m) {
*row0 = addv(addv(*row0, m), *row1);
*row3 = xorv(*row3, *row0);
*row3 = rot8(*row3);
*row2 = addv(*row2, *row3);
*row1 = xorv(*row1, *row2);
*row1 = rot7(*row1);
}
// Note the optimization here of leaving row1 as the unrotated row, rather than
// row0. All the message loads below are adjusted to compensate for this. See
// discussion at https://github.com/sneves/blake2-avx2/pull/4
INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
*row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3));
*row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
*row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1));
}
INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
*row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1));
*row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
*row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3));
}
INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter, uint8_t flags) {
rows[0] = loadu((uint8_t *)&cv[0]);
rows[1] = loadu((uint8_t *)&cv[4]);
rows[2] = set4(IV[0], IV[1], IV[2], IV[3]);
rows[3] = set4(counter_low(counter), counter_high(counter),
(uint32_t)block_len, (uint32_t)flags);
__m128i m0 = loadu(&block[sizeof(__m128i) * 0]);
__m128i m1 = loadu(&block[sizeof(__m128i) * 1]);
__m128i m2 = loadu(&block[sizeof(__m128i) * 2]);
__m128i m3 = loadu(&block[sizeof(__m128i) * 3]);
__m128i t0, t1, t2, t3, tt;
// Round 1. The first round permutes the message words from the original
// input order, into the groups that get mixed in parallel.
t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0
g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1
g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
diagonalize(&rows[0], &rows[2], &rows[3]);
t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8
t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14
g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9
t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15
g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
undiagonalize(&rows[0], &rows[2], &rows[3]);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 2. This round and all following rounds apply a fixed permutation
// to the message words from the round before.
t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
t1 = _mm_blend_epi16(tt, t1, 0xCC);
g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
diagonalize(&rows[0], &rows[2], &rows[3]);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = _mm_blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
undiagonalize(&rows[0], &rows[2], &rows[3]);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 3
t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
t1 = _mm_blend_epi16(tt, t1, 0xCC);
g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
diagonalize(&rows[0], &rows[2], &rows[3]);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = _mm_blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
undiagonalize(&rows[0], &rows[2], &rows[3]);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 4
t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
t1 = _mm_blend_epi16(tt, t1, 0xCC);
g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
diagonalize(&rows[0], &rows[2], &rows[3]);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = _mm_blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
undiagonalize(&rows[0], &rows[2], &rows[3]);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 5
t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
t1 = _mm_blend_epi16(tt, t1, 0xCC);
g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
diagonalize(&rows[0], &rows[2], &rows[3]);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = _mm_blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
undiagonalize(&rows[0], &rows[2], &rows[3]);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 6
t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
t1 = _mm_blend_epi16(tt, t1, 0xCC);
g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
diagonalize(&rows[0], &rows[2], &rows[3]);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = _mm_blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
undiagonalize(&rows[0], &rows[2], &rows[3]);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 7
t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
t1 = _mm_blend_epi16(tt, t1, 0xCC);
g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
diagonalize(&rows[0], &rows[2], &rows[3]);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = _mm_blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
undiagonalize(&rows[0], &rows[2], &rows[3]);
}
void blake3_compress_in_place_sse41(uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags) {
__m128i rows[4];
compress_pre(rows, cv, block, block_len, counter, flags);
storeu(xorv(rows[0], rows[2]), (uint8_t *)&cv[0]);
storeu(xorv(rows[1], rows[3]), (uint8_t *)&cv[4]);
}
void blake3_compress_xof_sse41(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags, uint8_t out[64]) {
__m128i rows[4];
compress_pre(rows, cv, block, block_len, counter, flags);
storeu(xorv(rows[0], rows[2]), &out[0]);
storeu(xorv(rows[1], rows[3]), &out[16]);
storeu(xorv(rows[2], loadu((uint8_t *)&cv[0])), &out[32]);
storeu(xorv(rows[3], loadu((uint8_t *)&cv[4])), &out[48]);
}
INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) {
v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
v[0] = addv(v[0], v[4]);
v[1] = addv(v[1], v[5]);
v[2] = addv(v[2], v[6]);
v[3] = addv(v[3], v[7]);
v[12] = xorv(v[12], v[0]);
v[13] = xorv(v[13], v[1]);
v[14] = xorv(v[14], v[2]);
v[15] = xorv(v[15], v[3]);
v[12] = rot16(v[12]);
v[13] = rot16(v[13]);
v[14] = rot16(v[14]);
v[15] = rot16(v[15]);
v[8] = addv(v[8], v[12]);
v[9] = addv(v[9], v[13]);
v[10] = addv(v[10], v[14]);
v[11] = addv(v[11], v[15]);
v[4] = xorv(v[4], v[8]);
v[5] = xorv(v[5], v[9]);
v[6] = xorv(v[6], v[10]);
v[7] = xorv(v[7], v[11]);
v[4] = rot12(v[4]);
v[5] = rot12(v[5]);
v[6] = rot12(v[6]);
v[7] = rot12(v[7]);
v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
v[0] = addv(v[0], v[4]);
v[1] = addv(v[1], v[5]);
v[2] = addv(v[2], v[6]);
v[3] = addv(v[3], v[7]);
v[12] = xorv(v[12], v[0]);
v[13] = xorv(v[13], v[1]);
v[14] = xorv(v[14], v[2]);
v[15] = xorv(v[15], v[3]);
v[12] = rot8(v[12]);
v[13] = rot8(v[13]);
v[14] = rot8(v[14]);
v[15] = rot8(v[15]);
v[8] = addv(v[8], v[12]);
v[9] = addv(v[9], v[13]);
v[10] = addv(v[10], v[14]);
v[11] = addv(v[11], v[15]);
v[4] = xorv(v[4], v[8]);
v[5] = xorv(v[5], v[9]);
v[6] = xorv(v[6], v[10]);
v[7] = xorv(v[7], v[11]);
v[4] = rot7(v[4]);
v[5] = rot7(v[5]);
v[6] = rot7(v[6]);
v[7] = rot7(v[7]);
v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
v[0] = addv(v[0], v[5]);
v[1] = addv(v[1], v[6]);
v[2] = addv(v[2], v[7]);
v[3] = addv(v[3], v[4]);
v[15] = xorv(v[15], v[0]);
v[12] = xorv(v[12], v[1]);
v[13] = xorv(v[13], v[2]);
v[14] = xorv(v[14], v[3]);
v[15] = rot16(v[15]);
v[12] = rot16(v[12]);
v[13] = rot16(v[13]);
v[14] = rot16(v[14]);
v[10] = addv(v[10], v[15]);
v[11] = addv(v[11], v[12]);
v[8] = addv(v[8], v[13]);
v[9] = addv(v[9], v[14]);
v[5] = xorv(v[5], v[10]);
v[6] = xorv(v[6], v[11]);
v[7] = xorv(v[7], v[8]);
v[4] = xorv(v[4], v[9]);
v[5] = rot12(v[5]);
v[6] = rot12(v[6]);
v[7] = rot12(v[7]);
v[4] = rot12(v[4]);
v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
v[0] = addv(v[0], v[5]);
v[1] = addv(v[1], v[6]);
v[2] = addv(v[2], v[7]);
v[3] = addv(v[3], v[4]);
v[15] = xorv(v[15], v[0]);
v[12] = xorv(v[12], v[1]);
v[13] = xorv(v[13], v[2]);
v[14] = xorv(v[14], v[3]);
v[15] = rot8(v[15]);
v[12] = rot8(v[12]);
v[13] = rot8(v[13]);
v[14] = rot8(v[14]);
v[10] = addv(v[10], v[15]);
v[11] = addv(v[11], v[12]);
v[8] = addv(v[8], v[13]);
v[9] = addv(v[9], v[14]);
v[5] = xorv(v[5], v[10]);
v[6] = xorv(v[6], v[11]);
v[7] = xorv(v[7], v[8]);
v[4] = xorv(v[4], v[9]);
v[5] = rot7(v[5]);
v[6] = rot7(v[6]);
v[7] = rot7(v[7]);
v[4] = rot7(v[4]);
}
INLINE void transpose_vecs(__m128i vecs[DEGREE]) {
// Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
// 22/33. Note that this doesn't split the vector into two lanes, as the
// AVX2 counterparts do.
__m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
__m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
__m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
__m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
// Interleave 64-bit lanes.
__m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01);
__m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01);
__m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23);
__m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23);
vecs[0] = abcd_0;
vecs[1] = abcd_1;
vecs[2] = abcd_2;
vecs[3] = abcd_3;
}
INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
size_t block_offset, __m128i out[16]) {
out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m128i)]);
out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m128i)]);
out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m128i)]);
out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m128i)]);
out[4] = loadu(&inputs[0][block_offset + 1 * sizeof(__m128i)]);
out[5] = loadu(&inputs[1][block_offset + 1 * sizeof(__m128i)]);
out[6] = loadu(&inputs[2][block_offset + 1 * sizeof(__m128i)]);
out[7] = loadu(&inputs[3][block_offset + 1 * sizeof(__m128i)]);
out[8] = loadu(&inputs[0][block_offset + 2 * sizeof(__m128i)]);
out[9] = loadu(&inputs[1][block_offset + 2 * sizeof(__m128i)]);
out[10] = loadu(&inputs[2][block_offset + 2 * sizeof(__m128i)]);
out[11] = loadu(&inputs[3][block_offset + 2 * sizeof(__m128i)]);
out[12] = loadu(&inputs[0][block_offset + 3 * sizeof(__m128i)]);
out[13] = loadu(&inputs[1][block_offset + 3 * sizeof(__m128i)]);
out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
for (size_t i = 0; i < 4; ++i) {
_mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
}
transpose_vecs(&out[0]);
transpose_vecs(&out[4]);
transpose_vecs(&out[8]);
transpose_vecs(&out[12]);
}
INLINE void load_counters(uint64_t counter, bool increment_counter,
__m128i *out_lo, __m128i *out_hi) {
const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter);
const __m128i add0 = _mm_set_epi32(3, 2, 1, 0);
const __m128i add1 = _mm_and_si128(mask, add0);
__m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1);
__m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)),
_mm_xor_si128( l, _mm_set1_epi32(0x80000000)));
__m128i h = _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry);
*out_lo = l;
*out_hi = h;
}
static
void blake3_hash4_sse41(const uint8_t *const *inputs, size_t blocks,
const uint32_t key[8], uint64_t counter,
bool increment_counter, uint8_t flags,
uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
__m128i h_vecs[8] = {
set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]),
set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]),
};
__m128i counter_low_vec, counter_high_vec;
load_counters(counter, increment_counter, &counter_low_vec,
&counter_high_vec);
uint8_t block_flags = flags | flags_start;
for (size_t block = 0; block < blocks; block++) {
if (block + 1 == blocks) {
block_flags |= flags_end;
}
__m128i block_len_vec = set1(BLAKE3_BLOCK_LEN);
__m128i block_flags_vec = set1(block_flags);
__m128i msg_vecs[16];
transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
__m128i v[16] = {
h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]),
counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
};
round_fn(v, msg_vecs, 0);
round_fn(v, msg_vecs, 1);
round_fn(v, msg_vecs, 2);
round_fn(v, msg_vecs, 3);
round_fn(v, msg_vecs, 4);
round_fn(v, msg_vecs, 5);
round_fn(v, msg_vecs, 6);
h_vecs[0] = xorv(v[0], v[8]);
h_vecs[1] = xorv(v[1], v[9]);
h_vecs[2] = xorv(v[2], v[10]);
h_vecs[3] = xorv(v[3], v[11]);
h_vecs[4] = xorv(v[4], v[12]);
h_vecs[5] = xorv(v[5], v[13]);
h_vecs[6] = xorv(v[6], v[14]);
h_vecs[7] = xorv(v[7], v[15]);
block_flags = flags;
}
transpose_vecs(&h_vecs[0]);
transpose_vecs(&h_vecs[4]);
// The first four vecs now contain the first half of each output, and the
// second four vecs contain the second half of each output.
storeu(h_vecs[0], &out[0 * sizeof(__m128i)]);
storeu(h_vecs[4], &out[1 * sizeof(__m128i)]);
storeu(h_vecs[1], &out[2 * sizeof(__m128i)]);
storeu(h_vecs[5], &out[3 * sizeof(__m128i)]);
storeu(h_vecs[2], &out[4 * sizeof(__m128i)]);
storeu(h_vecs[6], &out[5 * sizeof(__m128i)]);
storeu(h_vecs[3], &out[6 * sizeof(__m128i)]);
storeu(h_vecs[7], &out[7 * sizeof(__m128i)]);
}
INLINE void hash_one_sse41(const uint8_t *input, size_t blocks,
const uint32_t key[8], uint64_t counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) {
uint32_t cv[8];
memcpy(cv, key, BLAKE3_KEY_LEN);
uint8_t block_flags = flags | flags_start;
while (blocks > 0) {
if (blocks == 1) {
block_flags |= flags_end;
}
blake3_compress_in_place_sse41(cv, input, BLAKE3_BLOCK_LEN, counter,
block_flags);
input = &input[BLAKE3_BLOCK_LEN];
blocks -= 1;
block_flags = flags;
}
memcpy(out, cv, BLAKE3_OUT_LEN);
}
void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
uint64_t counter, bool increment_counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out) {
while (num_inputs >= DEGREE) {
blake3_hash4_sse41(inputs, blocks, key, counter, increment_counter, flags,
flags_start, flags_end, out);
if (increment_counter) {
counter += DEGREE;
}
inputs += DEGREE;
num_inputs -= DEGREE;
out = &out[DEGREE * BLAKE3_OUT_LEN];
}
while (num_inputs > 0) {
hash_one_sse41(inputs[0], blocks, key, counter, flags, flags_start,
flags_end, out);
if (increment_counter) {
counter += 1;
}
inputs += 1;
num_inputs -= 1;
out = &out[BLAKE3_OUT_LEN];
}
}

2028
external/blake3/blake3_sse41_x86-64_unix.S vendored Normal file

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

37
external/blake3/blake3_tbb.cpp vendored Normal file

@@ -0,0 +1,37 @@
#include <cstddef>
#include <cstdint>
#include <oneapi/tbb/parallel_invoke.h>
#include "blake3_impl.h"
static_assert(TBB_USE_EXCEPTIONS == 0,
"This file should be compiled with C++ exceptions disabled.");
extern "C" void blake3_compress_subtree_wide_join_tbb(
// shared params
const uint32_t key[8], uint8_t flags, bool use_tbb,
// left-hand side params
const uint8_t *l_input, size_t l_input_len, uint64_t l_chunk_counter,
uint8_t *l_cvs, size_t *l_n,
// right-hand side params
const uint8_t *r_input, size_t r_input_len, uint64_t r_chunk_counter,
uint8_t *r_cvs, size_t *r_n) noexcept {
if (!use_tbb) {
*l_n = blake3_compress_subtree_wide(l_input, l_input_len, key,
l_chunk_counter, flags, l_cvs, use_tbb);
*r_n = blake3_compress_subtree_wide(r_input, r_input_len, key,
r_chunk_counter, flags, r_cvs, use_tbb);
return;
}
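// Fork-join: hash the left and right subtrees as two TBB tasks. The lambdas
// capture the raw pointers by value, and parallel_invoke returns only after
// both have completed.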
oneapi::tbb::parallel_invoke(
[=]() {
*l_n = blake3_compress_subtree_wide(
l_input, l_input_len, key, l_chunk_counter, flags, l_cvs, use_tbb);
},
[=]() {
*r_n = blake3_compress_subtree_wide(
r_input, r_input_len, key, r_chunk_counter, flags, r_cvs, use_tbb);
});
}


@@ -0,0 +1,235 @@
cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
if(BUILD_SHARED_LIBS)
message(FATAL_ERROR "BUILD_SHARED_LIBS is incompatible with BLAKE3_TESTING_CI")
endif()
include(CTest)
# Declare a testing specific variant of the `blake3` library target.
#
# We use a separate library target in order to be able to perform compilation with various
# combinations of features which are too noisy to specify in the main CMake config as options for
# the normal `blake3` target.
#
# Initially this target has no properties but eventually we will populate them by copying all of the
# relevant properties from the normal `blake3` target.
add_library(blake3-testing
blake3.c
blake3_dispatch.c
blake3_portable.c
)
if(BLAKE3_USE_TBB AND TBB_FOUND)
target_sources(blake3-testing
PRIVATE
blake3_tbb.cpp)
endif()
if(BLAKE3_SIMD_TYPE STREQUAL "amd64-asm")
# Conditionally add amd64 asm files to `blake3-testing` sources
if(MSVC)
if(NOT BLAKE3_NO_AVX2)
list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_avx2_x86-64_windows_msvc.asm)
endif()
if(NOT BLAKE3_NO_AVX512)
list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_avx512_x86-64_windows_msvc.asm)
endif()
if(NOT BLAKE3_NO_SSE2)
list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_sse2_x86-64_windows_msvc.asm)
endif()
if(NOT BLAKE3_NO_SSE41)
list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_sse41_x86-64_windows_msvc.asm)
endif()
elseif(CMAKE_C_COMPILER_ID STREQUAL "GNU"
OR CMAKE_C_COMPILER_ID STREQUAL "Clang"
OR CMAKE_C_COMPILER_ID STREQUAL "AppleClang")
if (WIN32)
if(NOT BLAKE3_NO_AVX2)
list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_avx2_x86-64_windows_gnu.S)
endif()
if(NOT BLAKE3_NO_AVX512)
list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_avx512_x86-64_windows_gnu.S)
endif()
if(NOT BLAKE3_NO_SSE2)
list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_sse2_x86-64_windows_gnu.S)
endif()
if(NOT BLAKE3_NO_SSE41)
list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_sse41_x86-64_windows_gnu.S)
endif()
elseif(UNIX)
if(NOT BLAKE3_NO_AVX2)
list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_avx2_x86-64_unix.S)
endif()
if(NOT BLAKE3_NO_AVX512)
list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_avx512_x86-64_unix.S)
endif()
if(NOT BLAKE3_NO_SSE2)
list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_sse2_x86-64_unix.S)
endif()
if(NOT BLAKE3_NO_SSE41)
list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_sse41_x86-64_unix.S)
endif()
endif()
endif()
target_sources(blake3-testing PRIVATE ${BLAKE3_TESTING_AMD64_ASM_SOURCES})
elseif(BLAKE3_SIMD_TYPE STREQUAL "x86-intrinsics")
# Conditionally add amd64 C files to `blake3-testing` sources
if (NOT DEFINED BLAKE3_CFLAGS_SSE2
OR NOT DEFINED BLAKE3_CFLAGS_SSE4.1
OR NOT DEFINED BLAKE3_CFLAGS_AVX2
OR NOT DEFINED BLAKE3_CFLAGS_AVX512)
message(WARNING "BLAKE3_SIMD_TYPE is set to 'x86-intrinsics' but no compiler flags are available for the target architecture.")
else()
set(BLAKE3_SIMD_X86_INTRINSICS ON)
endif()
if(NOT BLAKE3_NO_AVX2)
target_sources(blake3-testing PRIVATE blake3_avx2.c)
set_source_files_properties(blake3_avx2.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_AVX2}")
endif()
if(NOT BLAKE3_NO_AVX512)
target_sources(blake3-testing PRIVATE blake3_avx512.c)
set_source_files_properties(blake3_avx512.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_AVX512}")
endif()
if(NOT BLAKE3_NO_SSE2)
target_sources(blake3-testing PRIVATE blake3_sse2.c)
set_source_files_properties(blake3_sse2.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_SSE2}")
endif()
if(NOT BLAKE3_NO_SSE41)
target_sources(blake3-testing PRIVATE blake3_sse41.c)
set_source_files_properties(blake3_sse41.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_SSE4.1}")
endif()
elseif(BLAKE3_SIMD_TYPE STREQUAL "neon-intrinsics")
# Conditionally add neon C files to `blake3-testing` sources
target_sources(blake3-testing PRIVATE
blake3_neon.c
)
target_compile_definitions(blake3-testing PRIVATE
BLAKE3_USE_NEON=1
)
if (DEFINED BLAKE3_CFLAGS_NEON)
set_source_files_properties(blake3_neon.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_NEON}")
endif()
elseif(BLAKE3_SIMD_TYPE STREQUAL "none")
# Disable neon if simd type is "none". We check for individual amd64 features further below.
target_compile_definitions(blake3-testing PRIVATE
BLAKE3_USE_NEON=0
)
endif()
if(BLAKE3_NO_AVX2)
target_compile_definitions(blake3-testing PRIVATE BLAKE3_NO_AVX2)
endif()
if(BLAKE3_NO_AVX512)
target_compile_definitions(blake3-testing PRIVATE BLAKE3_NO_AVX512)
endif()
if(BLAKE3_NO_SSE2)
target_compile_definitions(blake3-testing PRIVATE BLAKE3_NO_SSE2)
endif()
if(BLAKE3_NO_SSE41)
target_compile_definitions(blake3-testing PRIVATE BLAKE3_NO_SSE41)
endif()
target_compile_definitions(blake3-testing PUBLIC BLAKE3_TESTING)
get_target_property(BLAKE3_COMPILE_DEFINITIONS blake3 COMPILE_DEFINITIONS)
if(BLAKE3_COMPILE_DEFINITIONS)
target_compile_definitions(blake3-testing PUBLIC
${BLAKE3_COMPILE_DEFINITIONS})
endif()
get_target_property(BLAKE3_COMPILE_OPTIONS blake3 COMPILE_OPTIONS)
if(BLAKE3_COMPILE_OPTIONS)
target_compile_options(blake3-testing PRIVATE
${BLAKE3_COMPILE_OPTIONS}
-O3
-Wall
-Wextra
-pedantic
-fstack-protector-strong
-D_FORTIFY_SOURCE=2
-fPIE
-fvisibility=hidden
-fsanitize=address,undefined
)
endif()
get_target_property(BLAKE3_INCLUDE_DIRECTORIES blake3 INCLUDE_DIRECTORIES)
if(BLAKE3_INCLUDE_DIRECTORIES)
target_include_directories(blake3-testing PUBLIC
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
)
endif()
get_target_property(BLAKE3_LINK_LIBRARIES blake3 LINK_LIBRARIES)
if(BLAKE3_LINK_LIBRARIES)
target_link_libraries(blake3-testing PRIVATE ${BLAKE3_LINK_LIBRARIES})
endif()
get_target_property(BLAKE3_LINK_OPTIONS blake3 LINK_OPTIONS)
if(BLAKE3_LINK_OPTIONS)
target_link_options(blake3-testing PRIVATE
${BLAKE3_LINK_OPTIONS}
-fsanitize=address,undefined
-pie
-Wl,-z,relro,-z,now
)
endif()
# test asm target
add_executable(blake3-asm-test
main.c
)
set_target_properties(blake3-asm-test PROPERTIES
OUTPUT_NAME blake3
RUNTIME_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR})
target_link_libraries(blake3-asm-test PRIVATE blake3-testing)
target_compile_definitions(blake3-asm-test PRIVATE BLAKE3_TESTING)
target_compile_options(blake3-asm-test PRIVATE
-O3
-Wall
-Wextra
-pedantic
-fstack-protector-strong
-D_FORTIFY_SOURCE=2
-fPIE
-fvisibility=hidden
-fsanitize=address,undefined
)
target_link_options(blake3-asm-test PRIVATE
-fsanitize=address,undefined
-pie
-Wl,-z,relro,-z,now
)
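# Nested build-and-test: ctest re-configures the tree with the same feature
# flags, builds blake3-asm-test (emitted as `blake3` in the source dir), then
# drives test.py against it.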
add_test(NAME blake3-testing
COMMAND "${CMAKE_CTEST_COMMAND}"
--verbose
--extra-verbose
--build-and-test "${CMAKE_SOURCE_DIR}" "${CMAKE_BINARY_DIR}"
--build-generator "${CMAKE_GENERATOR}"
--build-makeprogram "${CMAKE_MAKE_PROGRAM}"
--build-project libblake3
--build-target blake3-asm-test
--build-options
--fresh
"-DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS}"
"-DBLAKE3_TESTING=${BLAKE3_TESTING}"
"-DBLAKE3_TESTING_CI=${BLAKE3_TESTING_CI}"
"-DBLAKE3_USE_TBB=${BLAKE3_USE_TBB}"
"-DBLAKE3_SIMD_TYPE=${BLAKE3_SIMD_TYPE}"
"-DBLAKE3_NO_SSE2=${BLAKE3_NO_SSE2}"
"-DBLAKE3_NO_SSE41=${BLAKE3_NO_SSE41}"
"-DBLAKE3_NO_AVX2=${BLAKE3_NO_AVX2}"
"-DBLAKE3_NO_AVX512=${BLAKE3_NO_AVX512}"
--test-command
"${CMAKE_SOURCE_DIR}/test.py"
)


@@ -0,0 +1,13 @@
if(NOT WIN32)
add_executable(blake3-example
example.c)
target_link_libraries(blake3-example PRIVATE blake3)
install(TARGETS blake3-example)
if(BLAKE3_USE_TBB)
add_executable(blake3-example-tbb
example_tbb.c)
target_link_libraries(blake3-example-tbb PRIVATE blake3)
install(TARGETS blake3-example-tbb)
endif()
endif()


@@ -0,0 +1,3 @@
if(BLAKE3_TESTING_CI)
include(BLAKE3/ContinuousIntegration)
endif()


@@ -0,0 +1,3 @@
if(BLAKE3_USE_TBB)
add_subdirectory(tbb)
endif()


@@ -0,0 +1,28 @@
find_package(TBB 2021.11.0 QUIET)
if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.11)
include(FetchContent)
if(NOT TBB_FOUND AND BLAKE3_FETCH_TBB)
set(CMAKE_C_STANDARD 99)
set(CMAKE_C_EXTENSIONS OFF)
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_EXTENSIONS ON)
option(TBB_TEST "" OFF)
option(TBBMALLOC_BUILD "" OFF)
mark_as_advanced(TBB_TEST)
mark_as_advanced(TBBMALLOC_BUILD)
FetchContent_Declare(
TBB
GIT_REPOSITORY https://github.com/uxlfoundation/oneTBB
GIT_TAG 0c0ff192a2304e114bc9e6557582dfba101360ff # v2022.0.0
GIT_SHALLOW TRUE
)
FetchContent_MakeAvailable(TBB)
endif()
endif()

36
external/blake3/example.c vendored Normal file

@@ -0,0 +1,36 @@
#include "blake3.h"
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
int main(void) {
// Initialize the hasher.
blake3_hasher hasher;
blake3_hasher_init(&hasher);
// Read input bytes from stdin.
unsigned char buf[65536];
while (1) {
ssize_t n = read(STDIN_FILENO, buf, sizeof(buf));
if (n > 0) {
blake3_hasher_update(&hasher, buf, n);
} else if (n == 0) {
break; // end of file
} else {
fprintf(stderr, "read failed: %s\n", strerror(errno));
return 1;
}
}
// Finalize the hash. BLAKE3_OUT_LEN is the default output length, 32 bytes.
uint8_t output[BLAKE3_OUT_LEN];
blake3_hasher_finalize(&hasher, output, BLAKE3_OUT_LEN);
// Print the hash as hexadecimal.
for (size_t i = 0; i < BLAKE3_OUT_LEN; i++) {
printf("%02x", output[i]);
}
printf("\n");
return 0;
}
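For reference, the upstream README builds this example along the lines of `gcc -O3 -o example example.c blake3.c blake3_dispatch.c blake3_portable.c` plus the assembly (or intrinsics) files matching the target; the exact file list depends on which SIMD backends are compiled in.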

57
external/blake3/example_tbb.c vendored Normal file

@@ -0,0 +1,57 @@
#include "blake3.h"
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
int main(int argc, char **argv) {
// For each filepath argument, memory map it and hash it.
for (int i = 1; i < argc; i++) {
// Open and memory map the file.
int fd = open(argv[i], O_RDONLY);
if (fd == -1) {
fprintf(stderr, "open failed: %s\n", strerror(errno));
return 1;
}
struct stat statbuf;
if (fstat(fd, &statbuf) == -1) {
fprintf(stderr, "stat failed: %s\n", strerror(errno));
return 1;
}
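// Note: mmap with length 0 fails with EINVAL, so this example assumes the
// input files are non-empty.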
void *mapped = mmap(NULL, statbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
if (mapped == MAP_FAILED) {
fprintf(stderr, "mmap failed: %s\n", strerror(errno));
return 1;
}
// Initialize the hasher.
blake3_hasher hasher;
blake3_hasher_init(&hasher);
// Hash the mapped file using multiple threads.
blake3_hasher_update_tbb(&hasher, mapped, statbuf.st_size);
// Unmap and close the file.
if (munmap(mapped, statbuf.st_size) == -1) {
fprintf(stderr, "munmap failed: %s\n", strerror(errno));
return 1;
}
if (close(fd) == -1) {
fprintf(stderr, "close failed: %s\n", strerror(errno));
return 1;
}
// Finalize the hash. BLAKE3_OUT_LEN is the default output length, 32 bytes.
uint8_t output[BLAKE3_OUT_LEN];
blake3_hasher_finalize(&hasher, output, BLAKE3_OUT_LEN);
// Print the hash as hexadecimal.
for (size_t i = 0; i < BLAKE3_OUT_LEN; i++) {
printf("%02x", output[i]);
}
printf("\n");
}
}

12
external/blake3/libblake3.pc.in vendored Normal file

@@ -0,0 +1,12 @@
prefix="@CMAKE_INSTALL_PREFIX@"
exec_prefix="${prefix}"
libdir="@PKG_CONFIG_INSTALL_LIBDIR@"
includedir="@PKG_CONFIG_INSTALL_INCLUDEDIR@"
Name: @PROJECT_NAME@
Description: @PROJECT_DESCRIPTION@
Version: @PROJECT_VERSION@
Requires: @PKG_CONFIG_REQUIRES@
Libs: -L"${libdir}" -lblake3 @PKG_CONFIG_LIBS@
Cflags: -I"${includedir}" @PKG_CONFIG_CFLAGS@

166
external/blake3/main.c vendored Normal file

@@ -0,0 +1,166 @@
/*
* This main file is intended for testing via `make test`. It does not build in
* other settings. See README.md in this directory for examples of how to build
* C code.
*/
#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "blake3.h"
#include "blake3_impl.h"
#define HASH_MODE 0
#define KEYED_HASH_MODE 1
#define DERIVE_KEY_MODE 2
static void hex_char_value(uint8_t c, uint8_t *value, bool *valid) {
if ('0' <= c && c <= '9') {
*value = c - '0';
*valid = true;
} else if ('a' <= c && c <= 'f') {
*value = 10 + c - 'a';
*valid = true;
} else {
*valid = false;
}
}
static int parse_key(char *hex_key, uint8_t out[BLAKE3_KEY_LEN]) {
size_t hex_len = strlen(hex_key);
if (hex_len != 64) {
fprintf(stderr, "Expected a 64-char hexadecimal key, got %zu chars.\n",
hex_len);
return 1;
}
for (size_t i = 0; i < 64; i++) {
uint8_t value;
bool valid;
hex_char_value(hex_key[i], &value, &valid);
if (!valid) {
fprintf(stderr, "Invalid hex char.\n");
return 1;
}
if (i % 2 == 0) {
out[i / 2] = 0;
value <<= 4;
}
out[i / 2] += value;
}
return 0;
}
/* A little repetition here */
enum cpu_feature {
SSE2 = 1 << 0,
SSSE3 = 1 << 1,
SSE41 = 1 << 2,
AVX = 1 << 3,
AVX2 = 1 << 4,
AVX512F = 1 << 5,
AVX512VL = 1 << 6,
/* ... */
UNDEFINED = 1 << 30
};
extern enum cpu_feature g_cpu_features;
enum cpu_feature get_cpu_features(void);
int main(int argc, char **argv) {
size_t out_len = BLAKE3_OUT_LEN;
uint8_t key[BLAKE3_KEY_LEN];
char *context = "";
uint8_t mode = HASH_MODE;
while (argc > 1) {
if (argc <= 2) {
fprintf(stderr, "Odd number of arguments.\n");
return 1;
}
if (strcmp("--length", argv[1]) == 0) {
char *endptr = NULL;
errno = 0;
unsigned long long out_len_ll = strtoull(argv[2], &endptr, 10);
if (errno != 0 || out_len_ll > SIZE_MAX || endptr == argv[2] ||
*endptr != 0) {
fprintf(stderr, "Bad length argument.\n");
return 1;
}
out_len = (size_t)out_len_ll;
} else if (strcmp("--keyed", argv[1]) == 0) {
mode = KEYED_HASH_MODE;
int ret = parse_key(argv[2], key);
if (ret != 0) {
return ret;
}
} else if (strcmp("--derive-key", argv[1]) == 0) {
mode = DERIVE_KEY_MODE;
context = argv[2];
} else {
fprintf(stderr, "Unknown flag.\n");
return 1;
}
argc -= 2;
argv += 2;
}
/*
* We're going to hash the input multiple times, so we need to buffer it all.
* This is just for test cases, so go ahead and assume that the input is less
* than 1 MiB.
*/
size_t buf_capacity = 1 << 20;
uint8_t *buf = malloc(buf_capacity);
assert(buf != NULL);
size_t buf_len = 0;
while (1) {
size_t n = fread(&buf[buf_len], 1, buf_capacity - buf_len, stdin);
if (n == 0) {
break;
}
buf_len += n;
assert(buf_len < buf_capacity);
}
const int mask = get_cpu_features();
int feature = 0;
do {
fprintf(stderr, "Testing 0x%08X\n", feature);
g_cpu_features = feature;
blake3_hasher hasher;
switch (mode) {
case HASH_MODE:
blake3_hasher_init(&hasher);
break;
case KEYED_HASH_MODE:
blake3_hasher_init_keyed(&hasher, key);
break;
case DERIVE_KEY_MODE:
blake3_hasher_init_derive_key(&hasher, context);
break;
default:
abort();
}
blake3_hasher_update(&hasher, buf, buf_len);
/* TODO: An incremental output reader API to avoid this allocation. */
uint8_t *out = malloc(out_len);
if (out_len > 0 && out == NULL) {
fprintf(stderr, "malloc() failed.\n");
return 1;
}
blake3_hasher_finalize(&hasher, out, out_len);
for (size_t i = 0; i < out_len; i++) {
printf("%02x", out[i]);
}
printf("\n");
free(out);
feature = (feature - mask) & mask;
} while (feature != 0);
free(buf);
return 0;
}
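The update step feature = (feature - mask) & mask in the loop above is the standard submask-enumeration identity: starting from 0, it visits every subset of the bits set in mask exactly once, in increasing order, and wraps back to 0 after the full mask, so every supported combination of CPU features gets tested. A tiny standalone illustration (not part of the vendored file):
#include <stdio.h>
int main(void) {
  const unsigned mask = 0xB; /* bits 0, 1, and 3 set */
  unsigned sub = 0;
  do {
    printf("0x%X\n", sub); /* prints 0x0 0x1 0x2 0x3 0x8 0x9 0xA 0xB */
    sub = (sub - mask) & mask; /* next submask; wraps to 0 after mask */
  } while (sub != 0);
  return 0;
}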

97
external/blake3/test.py vendored Executable file
View File

@@ -0,0 +1,97 @@
#! /usr/bin/env python3
from binascii import hexlify
import json
from os import path
import subprocess
HERE = path.dirname(__file__)
TEST_VECTORS_PATH = path.join(HERE, "..", "test_vectors", "test_vectors.json")
TEST_VECTORS = json.load(open(TEST_VECTORS_PATH))
def run_blake3(args, input):
output = subprocess.run([path.join(HERE, "blake3")] + args,
input=input,
stdout=subprocess.PIPE,
check=True)
return output.stdout.decode().strip()
# Fill the input with a repeating byte pattern. We use a cycle length of 251,
# because that's the largest prime number less than 256. This makes it unlikely
# that swapping any two adjacent input blocks or chunks will give the same
# answer.
def make_test_input(length):
i = 0
buf = bytearray()
while len(buf) < length:
buf.append(i)
i = (i + 1) % 251
return buf
def main():
for case in TEST_VECTORS["cases"]:
input_len = case["input_len"]
input = make_test_input(input_len)
hex_key = hexlify(TEST_VECTORS["key"].encode())
context_string = TEST_VECTORS["context_string"]
expected_hash_xof = case["hash"]
expected_hash = expected_hash_xof[:64]
expected_keyed_hash_xof = case["keyed_hash"]
expected_keyed_hash = expected_keyed_hash_xof[:64]
expected_derive_key_xof = case["derive_key"]
expected_derive_key = expected_derive_key_xof[:64]
# Test the default hash.
test_hash = run_blake3([], input)
for line in test_hash.splitlines():
assert expected_hash == line, \
"hash({}): {} != {}".format(input_len, expected_hash, line)
# Test the extended hash.
xof_len = len(expected_hash_xof) // 2
test_hash_xof = run_blake3(["--length", str(xof_len)], input)
for line in test_hash_xof.splitlines():
assert expected_hash_xof == line, \
"hash_xof({}): {} != {}".format(
input_len, expected_hash_xof, line)
# Test the default keyed hash.
test_keyed_hash = run_blake3(["--keyed", hex_key], input)
for line in test_keyed_hash.splitlines():
assert expected_keyed_hash == line, \
"keyed_hash({}): {} != {}".format(
input_len, expected_keyed_hash, line)
# Test the extended keyed hash.
xof_len = len(expected_keyed_hash_xof) // 2
test_keyed_hash_xof = run_blake3(
["--keyed", hex_key, "--length",
str(xof_len)], input)
for line in test_keyed_hash_xof.splitlines():
assert expected_keyed_hash_xof == line, \
"keyed_hash_xof({}): {} != {}".format(
input_len, expected_keyed_hash_xof, line)
# Test the default derive key.
test_derive_key = run_blake3(["--derive-key", context_string], input)
for line in test_derive_key.splitlines():
assert expected_derive_key == line, \
"derive_key({}): {} != {}".format(
input_len, expected_derive_key, line)
# Test the extended derive key.
xof_len = len(expected_derive_key_xof) // 2
test_derive_key_xof = run_blake3(
["--derive-key", context_string, "--length",
str(xof_len)], input)
for line in test_derive_key_xof.splitlines():
assert expected_derive_key_xof == line, \
"derive_key_xof({}): {} != {}".format(
input_len, expected_derive_key_xof, line)
if __name__ == "__main__":
main()
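The --length flag above exercises BLAKE3's extendable output: blake3_hasher_finalize can emit any number of bytes, and the first 32 bytes of a longer output equal the default-length hash, which is why this script can compare against prefixes of the XOF test vectors. A hedged C sketch of requesting 64 bytes:
/* Sketch: extended (XOF) output. The first BLAKE3_OUT_LEN bytes match
 * the default-length hash of the same input. */
#include "blake3.h"
#include <stdint.h>
void finalize_xof64(const blake3_hasher *hasher, uint8_t out[64]) {
  blake3_hasher_finalize(hasher, out, 64);
}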

View File

@@ -1,27 +1,12 @@
sources:
"6.29.5":
url: "https://github.com/facebook/rocksdb/archive/refs/tags/v6.29.5.tar.gz"
sha256: "ddbf84791f0980c0bbce3902feb93a2c7006f6f53bfd798926143e31d4d756f0"
"6.27.3":
url: "https://github.com/facebook/rocksdb/archive/refs/tags/v6.27.3.tar.gz"
sha256: "ee29901749b9132692b26f0a6c1d693f47d1a9ed8e3771e60556afe80282bf58"
"6.20.3":
url: "https://github.com/facebook/rocksdb/archive/refs/tags/v6.20.3.tar.gz"
sha256: "c6502c7aae641b7e20fafa6c2b92273d935d2b7b2707135ebd9a67b092169dca"
"8.8.1":
url: "https://github.com/facebook/rocksdb/archive/refs/tags/v8.8.1.tar.gz"
sha256: "056c7e21ad8ae36b026ac3b94b9d6e0fcc60e1d937fc80330921e4181be5c36e"
"9.7.3":
url: "https://github.com/facebook/rocksdb/archive/refs/tags/v9.7.3.tar.gz"
sha256: "acfabb989cbfb5b5c4d23214819b059638193ec33dad2d88373c46448d16d38b"
patches:
"6.29.5":
- patch_file: "patches/6.29.5-0001-add-include-cstdint-for-gcc-13.patch"
patch_description: "Fix build with gcc 13 by including cstdint"
patch_type: "portability"
patch_source: "https://github.com/facebook/rocksdb/pull/11118"
- patch_file: "patches/6.29.5-0002-exclude-thirdparty.patch"
"9.7.3":
- patch_file: "patches/9.x.x-0001-exclude-thirdparty.patch"
patch_description: "Do not include thirdparty.inc"
patch_type: "portability"
"6.27.3":
- patch_file: "patches/6.27.3-0001-add-include-cstdint-for-gcc-13.patch"
patch_description: "Fix build with gcc 13 by including cstdint"
- patch_file: "patches/9.7.3-0001-memory-leak.patch"
patch_description: "Fix a leak of obsolete blob files left open until DB::Close()"
patch_type: "portability"
patch_source: "https://github.com/facebook/rocksdb/pull/11118"

View File

@@ -15,10 +15,10 @@ required_conan_version = ">=1.53.0"
class RocksDBConan(ConanFile):
name = "rocksdb"
homepage = "https://github.com/facebook/rocksdb"
description = "A library that provides an embeddable, persistent key-value store for fast storage"
license = ("GPL-2.0-only", "Apache-2.0")
url = "https://github.com/conan-io/conan-center-index"
description = "A library that provides an embeddable, persistent key-value store for fast storage"
homepage = "https://github.com/facebook/rocksdb"
topics = ("database", "leveldb", "facebook", "key-value")
package_type = "library"
settings = "os", "arch", "compiler", "build_type"
@@ -58,12 +58,12 @@ class RocksDBConan(ConanFile):
@property
def _compilers_minimum_version(self):
return {} if self._min_cppstd == "11" else {
"apple-clang": "10",
"clang": "7",
"gcc": "7",
"msvc": "191",
"Visual Studio": "15",
}
"apple-clang": "10",
"clang": "7",
"gcc": "7",
"msvc": "191",
"Visual Studio": "15",
}
def export_sources(self):
export_conandata_patches(self)
@@ -89,13 +89,13 @@ class RocksDBConan(ConanFile):
if self.options.with_snappy:
self.requires("snappy/1.1.10")
if self.options.with_lz4:
self.requires("lz4/1.9.4")
self.requires("lz4/1.10.0")
if self.options.with_zlib:
self.requires("zlib/[>=1.2.11 <2]")
if self.options.with_zstd:
self.requires("zstd/1.5.5")
self.requires("zstd/1.5.6")
if self.options.get_safe("with_tbb"):
self.requires("onetbb/2021.10.0")
self.requires("onetbb/2021.12.0")
if self.options.with_jemalloc:
self.requires("jemalloc/5.3.0")
@@ -115,9 +115,9 @@ class RocksDBConan(ConanFile):
check_min_vs(self, "191")
if self.version == "6.20.3" and \
self.settings.os == "Linux" and \
self.settings.compiler == "gcc" and \
Version(self.settings.compiler.version) < "5":
self.settings.os == "Linux" and \
self.settings.compiler == "gcc" and \
Version(self.settings.compiler.version) < "5":
raise ConanInvalidConfiguration("Rocksdb 6.20.3 is not compilable with gcc <5.") # See https://github.com/facebook/rocksdb/issues/3522
def source(self):
@@ -163,6 +163,8 @@ class RocksDBConan(ConanFile):
if self.options.with_jemalloc:
deps.set_property("jemalloc", "cmake_file_name", "JeMalloc")
deps.set_property("jemalloc", "cmake_target_name", "JeMalloc::JeMalloc")
if self.options.with_zstd:
deps.set_property("zstd", "cmake_target_name", "zstd::zstd")
deps.generate()
def build(self):

View File

@@ -1,30 +0,0 @@
--- a/include/rocksdb/utilities/checkpoint.h
+++ b/include/rocksdb/utilities/checkpoint.h
@@ -8,6 +8,7 @@
#pragma once
#ifndef ROCKSDB_LITE
+#include <cstdint>
#include <string>
#include <vector>
#include "rocksdb/status.h"
--- a/table/block_based/data_block_hash_index.h
+++ b/table/block_based/data_block_hash_index.h
@@ -5,6 +5,7 @@
#pragma once
+#include <cstdint>
#include <string>
#include <vector>
--- a/util/string_util.h
+++ b/util/string_util.h
@@ -6,6 +6,7 @@
#pragma once
+#include <cstdint>
#include <sstream>
#include <string>
#include <unordered_map>

View File

@@ -1,16 +0,0 @@
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ec59d4491..35577c998 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -101 +100,0 @@ if(MSVC)
- option(WITH_GFLAGS "build with GFlags" OFF)
@@ -103,2 +102,2 @@ if(MSVC)
- include(${CMAKE_CURRENT_SOURCE_DIR}/thirdparty.inc)
-else()
+endif()
+
@@ -117 +116 @@ else()
- if(MINGW)
+ if(MINGW OR MSVC)
@@ -183 +181,0 @@ else()
-endif()

View File

@@ -0,0 +1,319 @@
diff --git a/HISTORY.md b/HISTORY.md
index 36d472229..05ad1a202 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,6 +1,10 @@
# Rocksdb Change Log
> NOTE: Entries for next release do not go here. Follow instructions in `unreleased_history/README.txt`
+## 9.7.4 (10/31/2024)
+### Bug Fixes
+* Fix a leak of obsolete blob files left open until DB::Close(). This bug was introduced in version 9.4.0.
+
## 9.7.3 (10/16/2024)
### Behavior Changes
* OPTIONS file to be loaded by remote worker is now preserved so that it does not get purged by the primary host. A similar technique as how we are preserving new SST files from getting purged is used for this. min_options_file_numbers_ is tracked like pending_outputs_ is tracked.
diff --git a/db/blob/blob_file_cache.cc b/db/blob/blob_file_cache.cc
index 5f340aadf..1b9faa238 100644
--- a/db/blob/blob_file_cache.cc
+++ b/db/blob/blob_file_cache.cc
@@ -42,6 +42,7 @@ Status BlobFileCache::GetBlobFileReader(
assert(blob_file_reader);
assert(blob_file_reader->IsEmpty());
+ // NOTE: sharing same Cache with table_cache
const Slice key = GetSliceForKey(&blob_file_number);
assert(cache_);
@@ -98,4 +99,13 @@ Status BlobFileCache::GetBlobFileReader(
return Status::OK();
}
+void BlobFileCache::Evict(uint64_t blob_file_number) {
+ // NOTE: sharing same Cache with table_cache
+ const Slice key = GetSliceForKey(&blob_file_number);
+
+ assert(cache_);
+
+ cache_.get()->Erase(key);
+}
+
} // namespace ROCKSDB_NAMESPACE
diff --git a/db/blob/blob_file_cache.h b/db/blob/blob_file_cache.h
index 740e67ada..6858d012b 100644
--- a/db/blob/blob_file_cache.h
+++ b/db/blob/blob_file_cache.h
@@ -36,6 +36,15 @@ class BlobFileCache {
uint64_t blob_file_number,
CacheHandleGuard<BlobFileReader>* blob_file_reader);
+ // Called when a blob file is obsolete to ensure it is removed from the cache
+ // to avoid effectively leaking the open file and associated memory
+ void Evict(uint64_t blob_file_number);
+
+ // Used to identify cache entries for blob files (not normally useful)
+ static const Cache::CacheItemHelper* GetHelper() {
+ return CacheInterface::GetBasicHelper();
+ }
+
private:
using CacheInterface =
BasicTypedCacheInterface<BlobFileReader, CacheEntryRole::kMisc>;
diff --git a/db/column_family.h b/db/column_family.h
index e4b7adde8..86637736a 100644
--- a/db/column_family.h
+++ b/db/column_family.h
@@ -401,6 +401,7 @@ class ColumnFamilyData {
SequenceNumber earliest_seq);
TableCache* table_cache() const { return table_cache_.get(); }
+ BlobFileCache* blob_file_cache() const { return blob_file_cache_.get(); }
BlobSource* blob_source() const { return blob_source_.get(); }
// See documentation in compaction_picker.h
diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index 261593423..06573ac2e 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -659,8 +659,9 @@ Status DBImpl::CloseHelper() {
// We need to release them before the block cache is destroyed. The block
// cache may be destroyed inside versions_.reset(), when column family data
// list is destroyed, so leaving handles in table cache after
- // versions_.reset() may cause issues.
- // Here we clean all unreferenced handles in table cache.
+ // versions_.reset() may cause issues. Here we clean all unreferenced handles
+ // in table cache, and (for certain builds/conditions) assert that no obsolete
+ // files are hanging around unreferenced (leak) in the table/blob file cache.
// Now we assume all user queries have finished, so only version set itself
// can possibly hold the blocks from block cache. After releasing unreferenced
// handles here, only handles held by version set left and inside
@@ -668,6 +669,9 @@ Status DBImpl::CloseHelper() {
// time a handle is released, we erase it from the cache too. By doing that,
// we can guarantee that after versions_.reset(), table cache is empty
// so the cache can be safely destroyed.
+#ifndef NDEBUG
+ TEST_VerifyNoObsoleteFilesCached(/*db_mutex_already_held=*/true);
+#endif // !NDEBUG
table_cache_->EraseUnRefEntries();
for (auto& txn_entry : recovered_transactions_) {
@@ -3227,6 +3231,8 @@ Status DBImpl::MultiGetImpl(
s = Status::Aborted();
break;
}
+ // This could be a long-running operation
+ ROCKSDB_THREAD_YIELD_HOOK();
}
// Post processing (decrement reference counts and record statistics)
diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h
index 5e4fa310b..ccc0abfa7 100644
--- a/db/db_impl/db_impl.h
+++ b/db/db_impl/db_impl.h
@@ -1241,9 +1241,14 @@ class DBImpl : public DB {
static Status TEST_ValidateOptions(const DBOptions& db_options) {
return ValidateOptions(db_options);
}
-
#endif // NDEBUG
+ // In certain configurations, verify that the table/blob file cache only
+ // contains entries for live files, to check for effective leaks of open
+ // files. This can only be called when purging of obsolete files has
+ // "settled," such as during parts of DB Close().
+ void TEST_VerifyNoObsoleteFilesCached(bool db_mutex_already_held) const;
+
// persist stats to column family "_persistent_stats"
void PersistStats();
diff --git a/db/db_impl/db_impl_debug.cc b/db/db_impl/db_impl_debug.cc
index 790a50d7a..67f5b4aaf 100644
--- a/db/db_impl/db_impl_debug.cc
+++ b/db/db_impl/db_impl_debug.cc
@@ -9,6 +9,7 @@
#ifndef NDEBUG
+#include "db/blob/blob_file_cache.h"
#include "db/column_family.h"
#include "db/db_impl/db_impl.h"
#include "db/error_handler.h"
@@ -328,5 +329,49 @@ size_t DBImpl::TEST_EstimateInMemoryStatsHistorySize() const {
InstrumentedMutexLock l(&const_cast<DBImpl*>(this)->stats_history_mutex_);
return EstimateInMemoryStatsHistorySize();
}
+
+void DBImpl::TEST_VerifyNoObsoleteFilesCached(
+ bool db_mutex_already_held) const {
+ // This check is somewhat expensive and obscure to make a part of every
+ // unit test in every build variety. Thus, we only enable it for ASAN builds.
+ if (!kMustFreeHeapAllocations) {
+ return;
+ }
+
+ std::optional<InstrumentedMutexLock> l;
+ if (db_mutex_already_held) {
+ mutex_.AssertHeld();
+ } else {
+ l.emplace(&mutex_);
+ }
+
+ std::vector<uint64_t> live_files;
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ // Sneakily add both SST and blob files to the same list
+ cfd->current()->AddLiveFiles(&live_files, &live_files);
+ }
+ std::sort(live_files.begin(), live_files.end());
+
+ auto fn = [&live_files](const Slice& key, Cache::ObjectPtr, size_t,
+ const Cache::CacheItemHelper* helper) {
+ if (helper != BlobFileCache::GetHelper()) {
+ // Skip non-blob files for now
+ // FIXME: diagnose and fix the leaks of obsolete SST files revealed in
+ // unit tests.
+ return;
+ }
+ // See TableCache and BlobFileCache
+ assert(key.size() == sizeof(uint64_t));
+ uint64_t file_number;
+ GetUnaligned(reinterpret_cast<const uint64_t*>(key.data()), &file_number);
+ // Assert file is in sorted live_files
+ assert(
+ std::binary_search(live_files.begin(), live_files.end(), file_number));
+ };
+ table_cache_->ApplyToAllEntries(fn, {});
+}
} // namespace ROCKSDB_NAMESPACE
#endif // NDEBUG
diff --git a/db/db_iter.cc b/db/db_iter.cc
index e02586377..bf4749eb9 100644
--- a/db/db_iter.cc
+++ b/db/db_iter.cc
@@ -540,6 +540,8 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key,
} else {
iter_.Next();
}
+ // This could be a long-running operation due to tombstones, etc.
+ ROCKSDB_THREAD_YIELD_HOOK();
} while (iter_.Valid());
valid_ = false;
diff --git a/db/table_cache.cc b/db/table_cache.cc
index 71fc29c32..8a5be75e8 100644
--- a/db/table_cache.cc
+++ b/db/table_cache.cc
@@ -164,6 +164,7 @@ Status TableCache::GetTableReader(
}
Cache::Handle* TableCache::Lookup(Cache* cache, uint64_t file_number) {
+ // NOTE: sharing same Cache with BlobFileCache
Slice key = GetSliceForFileNumber(&file_number);
return cache->Lookup(key);
}
@@ -179,6 +180,7 @@ Status TableCache::FindTable(
size_t max_file_size_for_l0_meta_pin, Temperature file_temperature) {
PERF_TIMER_GUARD_WITH_CLOCK(find_table_nanos, ioptions_.clock);
uint64_t number = file_meta.fd.GetNumber();
+ // NOTE: sharing same Cache with BlobFileCache
Slice key = GetSliceForFileNumber(&number);
*handle = cache_.Lookup(key);
TEST_SYNC_POINT_CALLBACK("TableCache::FindTable:0",
diff --git a/db/version_builder.cc b/db/version_builder.cc
index ed8ab8214..c98f53f42 100644
--- a/db/version_builder.cc
+++ b/db/version_builder.cc
@@ -24,6 +24,7 @@
#include <vector>
#include "cache/cache_reservation_manager.h"
+#include "db/blob/blob_file_cache.h"
#include "db/blob/blob_file_meta.h"
#include "db/dbformat.h"
#include "db/internal_stats.h"
@@ -744,12 +745,9 @@ class VersionBuilder::Rep {
return Status::Corruption("VersionBuilder", oss.str());
}
- // Note: we use C++11 for now but in C++14, this could be done in a more
- // elegant way using generalized lambda capture.
- VersionSet* const vs = version_set_;
- const ImmutableCFOptions* const ioptions = ioptions_;
-
- auto deleter = [vs, ioptions](SharedBlobFileMetaData* shared_meta) {
+ auto deleter = [vs = version_set_, ioptions = ioptions_,
+ bc = cfd_ ? cfd_->blob_file_cache()
+ : nullptr](SharedBlobFileMetaData* shared_meta) {
if (vs) {
assert(ioptions);
assert(!ioptions->cf_paths.empty());
@@ -758,6 +756,9 @@ class VersionBuilder::Rep {
vs->AddObsoleteBlobFile(shared_meta->GetBlobFileNumber(),
ioptions->cf_paths.front().path);
}
+ if (bc) {
+ bc->Evict(shared_meta->GetBlobFileNumber());
+ }
delete shared_meta;
};
@@ -766,7 +767,7 @@ class VersionBuilder::Rep {
blob_file_number, blob_file_addition.GetTotalBlobCount(),
blob_file_addition.GetTotalBlobBytes(),
blob_file_addition.GetChecksumMethod(),
- blob_file_addition.GetChecksumValue(), deleter);
+ blob_file_addition.GetChecksumValue(), std::move(deleter));
mutable_blob_file_metas_.emplace(
blob_file_number, MutableBlobFileMetaData(std::move(shared_meta)));
diff --git a/db/version_set.h b/db/version_set.h
index 9336782b1..024f869e7 100644
--- a/db/version_set.h
+++ b/db/version_set.h
@@ -1514,7 +1514,6 @@ class VersionSet {
void GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata);
void AddObsoleteBlobFile(uint64_t blob_file_number, std::string path) {
- // TODO: Erase file from BlobFileCache?
obsolete_blob_files_.emplace_back(blob_file_number, std::move(path));
}
diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h
index 2a19796b8..0afa2cab1 100644
--- a/include/rocksdb/version.h
+++ b/include/rocksdb/version.h
@@ -13,7 +13,7 @@
// minor or major version number planned for release.
#define ROCKSDB_MAJOR 9
#define ROCKSDB_MINOR 7
-#define ROCKSDB_PATCH 3
+#define ROCKSDB_PATCH 4
// Do not use these. We made the mistake of declaring macros starting with
// double underscore. Now we have to live with our choice. We'll deprecate these
diff --git a/port/port.h b/port/port.h
index 13aa56d47..141716e5b 100644
--- a/port/port.h
+++ b/port/port.h
@@ -19,3 +19,19 @@
#elif defined(OS_WIN)
#include "port/win/port_win.h"
#endif
+
+#ifdef OS_LINUX
+// A temporary hook into long-running RocksDB threads to support modifying their
+// priority etc. This should become a public API hook once the requirements
+// are better understood.
+extern "C" void RocksDbThreadYield() __attribute__((__weak__));
+#define ROCKSDB_THREAD_YIELD_HOOK() \
+ { \
+ if (RocksDbThreadYield) { \
+ RocksDbThreadYield(); \
+ } \
+ }
+#else
+#define ROCKSDB_THREAD_YIELD_HOOK() \
+ {}
+#endif
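Because RocksDbThreadYield is declared weak, the reference is null unless the embedding application defines the symbol, in which case the long-running loops patched above (MultiGet, iterator scans) start calling it. A hypothetical application-side definition (illustrative only, not part of the patch):
/* Hypothetical hook in the embedding application: defining this symbol
 * anywhere in the final binary activates the yield calls patched in above. */
#include <sched.h>
void RocksDbThreadYield(void) {
  sched_yield(); /* e.g., let higher-priority threads run */
}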

View File

@@ -0,0 +1,30 @@
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 93b884d..b715cb6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -106,14 +106,9 @@ endif()
include(CMakeDependentOption)
if(MSVC)
- option(WITH_GFLAGS "build with GFlags" OFF)
option(WITH_XPRESS "build with windows built in compression" OFF)
- option(ROCKSDB_SKIP_THIRDPARTY "skip thirdparty.inc" OFF)
-
- if(NOT ROCKSDB_SKIP_THIRDPARTY)
- include(${CMAKE_CURRENT_SOURCE_DIR}/thirdparty.inc)
- endif()
-else()
+endif()
+if(TRUE)
if(CMAKE_SYSTEM_NAME MATCHES "FreeBSD" AND NOT CMAKE_SYSTEM_NAME MATCHES "kFreeBSD")
# FreeBSD has jemalloc as default malloc
# but it does not have all the jemalloc files in include/...
@@ -126,7 +121,7 @@ else()
endif()
endif()
- if(MINGW)
+ if(MSVC OR MINGW)
option(WITH_GFLAGS "build with GFlags" OFF)
else()
option(WITH_GFLAGS "build with GFlags" ON)

View File

@@ -10,8 +10,8 @@ env:
MAKEFLAGS: -j4
BUILD: check
### secp256k1 config
ECMULTWINDOW: auto
ECMULTGENPRECISION: auto
ECMULTWINDOW: 15
ECMULTGENKB: 22
ASM: no
WIDEMUL: auto
WITH_VALGRIND: yes
@@ -20,20 +20,18 @@ env:
EXPERIMENTAL: no
ECDH: no
RECOVERY: no
EXTRAKEYS: no
SCHNORRSIG: no
MUSIG: no
ELLSWIFT: no
### test options
SECP256K1_TEST_ITERS:
SECP256K1_TEST_ITERS: 64
BENCH: yes
SECP256K1_BENCH_ITERS: 2
CTIMETESTS: yes
# Compile and run the tests
EXAMPLES: yes
# https://cirrus-ci.org/pricing/#compute-credits
credits_snippet: &CREDITS
# Don't use any credits for now.
use_compute_credits: false
cat_logs_snippet: &CAT_LOGS
always:
cat_tests_log_script:
@@ -53,357 +51,51 @@ cat_logs_snippet: &CAT_LOGS
cat_ci_env_script:
- env
merge_base_script_snippet: &MERGE_BASE
merge_base_script:
- if [ "$CIRRUS_PR" = "" ]; then exit 0; fi
- git fetch --depth=1 $CIRRUS_REPO_CLONE_URL "pull/${CIRRUS_PR}/merge"
- git checkout FETCH_HEAD # Use merged changes to detect silent merge conflicts
linux_container_snippet: &LINUX_CONTAINER
container:
dockerfile: ci/linux-debian.Dockerfile
# Reduce number of CPUs to be able to do more builds in parallel.
cpu: 1
# Gives us more CPUs for free if they're available.
greedy: true
# More than enough for our scripts.
memory: 1G
task:
name: "x86_64: Linux (Debian stable)"
<< : *LINUX_CONTAINER
matrix: &ENV_MATRIX
- env: {WIDEMUL: int64, RECOVERY: yes}
- env: {WIDEMUL: int64, ECDH: yes, SCHNORRSIG: yes}
- env: {WIDEMUL: int128}
- env: {WIDEMUL: int128_struct}
- env: {WIDEMUL: int128, RECOVERY: yes, SCHNORRSIG: yes}
- env: {WIDEMUL: int128, ECDH: yes, SCHNORRSIG: yes}
- env: {WIDEMUL: int128, ASM: x86_64}
- env: { RECOVERY: yes, SCHNORRSIG: yes}
- env: {CTIMETESTS: no, RECOVERY: yes, ECDH: yes, SCHNORRSIG: yes, CPPFLAGS: -DVERIFY}
- env: {BUILD: distcheck, WITH_VALGRIND: no, CTIMETESTS: no, BENCH: no}
- env: {CPPFLAGS: -DDETERMINISTIC}
- env: {CFLAGS: -O0, CTIMETESTS: no}
- env: { ECMULTGENPRECISION: 2, ECMULTWINDOW: 2 }
- env: { ECMULTGENPRECISION: 8, ECMULTWINDOW: 4 }
matrix:
- env:
CC: gcc
- env:
CC: clang
<< : *MERGE_BASE
test_script:
- ./ci/cirrus.sh
<< : *CAT_LOGS
task:
name: "i686: Linux (Debian stable)"
<< : *LINUX_CONTAINER
env:
HOST: i686-linux-gnu
ECDH: yes
RECOVERY: yes
SCHNORRSIG: yes
matrix:
- env:
CC: i686-linux-gnu-gcc
- env:
CC: clang --target=i686-pc-linux-gnu -isystem /usr/i686-linux-gnu/include
<< : *MERGE_BASE
test_script:
- ./ci/cirrus.sh
<< : *CAT_LOGS
task:
name: "arm64: macOS Ventura"
macos_instance:
image: ghcr.io/cirruslabs/macos-ventura-base:latest
env:
HOMEBREW_NO_AUTO_UPDATE: 1
HOMEBREW_NO_INSTALL_CLEANUP: 1
# Cirrus gives us a fixed number of 4 virtual CPUs. Not that we even have that many jobs at the moment...
MAKEFLAGS: -j5
matrix:
<< : *ENV_MATRIX
env:
ASM: no
WITH_VALGRIND: no
CTIMETESTS: no
matrix:
- env:
CC: gcc
- env:
CC: clang
brew_script:
- brew install automake libtool gcc
<< : *MERGE_BASE
test_script:
- ./ci/cirrus.sh
<< : *CAT_LOGS
<< : *CREDITS
task:
name: "s390x (big-endian): Linux (Debian stable, QEMU)"
<< : *LINUX_CONTAINER
env:
WRAPPER_CMD: qemu-s390x
SECP256K1_TEST_ITERS: 16
HOST: s390x-linux-gnu
WITH_VALGRIND: no
ECDH: yes
RECOVERY: yes
SCHNORRSIG: yes
CTIMETESTS: no
<< : *MERGE_BASE
test_script:
# https://sourceware.org/bugzilla/show_bug.cgi?id=27008
- rm /etc/ld.so.cache
- ./ci/cirrus.sh
<< : *CAT_LOGS
task:
name: "ARM32: Linux (Debian stable, QEMU)"
<< : *LINUX_CONTAINER
env:
WRAPPER_CMD: qemu-arm
SECP256K1_TEST_ITERS: 16
HOST: arm-linux-gnueabihf
WITH_VALGRIND: no
ECDH: yes
RECOVERY: yes
SCHNORRSIG: yes
CTIMETESTS: no
matrix:
- env: {}
- env: {EXPERIMENTAL: yes, ASM: arm32}
<< : *MERGE_BASE
test_script:
- ./ci/cirrus.sh
<< : *CAT_LOGS
task:
name: "ARM64: Linux (Debian stable, QEMU)"
<< : *LINUX_CONTAINER
env:
WRAPPER_CMD: qemu-aarch64
SECP256K1_TEST_ITERS: 16
HOST: aarch64-linux-gnu
WITH_VALGRIND: no
ECDH: yes
RECOVERY: yes
SCHNORRSIG: yes
CTIMETESTS: no
<< : *MERGE_BASE
test_script:
- ./ci/cirrus.sh
<< : *CAT_LOGS
task:
name: "ppc64le: Linux (Debian stable, QEMU)"
<< : *LINUX_CONTAINER
env:
WRAPPER_CMD: qemu-ppc64le
SECP256K1_TEST_ITERS: 16
HOST: powerpc64le-linux-gnu
WITH_VALGRIND: no
ECDH: yes
RECOVERY: yes
SCHNORRSIG: yes
CTIMETESTS: no
<< : *MERGE_BASE
test_script:
- ./ci/cirrus.sh
<< : *CAT_LOGS
task:
<< : *LINUX_CONTAINER
env:
WRAPPER_CMD: wine
WITH_VALGRIND: no
ECDH: yes
RECOVERY: yes
SCHNORRSIG: yes
CTIMETESTS: no
matrix:
- name: "x86_64 (mingw32-w64): Windows (Debian stable, Wine)"
env:
HOST: x86_64-w64-mingw32
- name: "i686 (mingw32-w64): Windows (Debian stable, Wine)"
env:
HOST: i686-w64-mingw32
<< : *MERGE_BASE
test_script:
- ./ci/cirrus.sh
<< : *CAT_LOGS
task:
<< : *LINUX_CONTAINER
env:
WRAPPER_CMD: wine
WERROR_CFLAGS: -WX
WITH_VALGRIND: no
ECDH: yes
RECOVERY: yes
EXPERIMENTAL: yes
SCHNORRSIG: yes
CTIMETESTS: no
# Use a MinGW-w64 host to tell ./configure we're building for Windows.
# This will detect some MinGW-w64 tools but then make will need only
# the MSVC tools CC, AR and NM as specified below.
HOST: x86_64-w64-mingw32
CC: /opt/msvc/bin/x64/cl
AR: /opt/msvc/bin/x64/lib
NM: /opt/msvc/bin/x64/dumpbin -symbols -headers
# Set non-essential options that affect the CLI messages here.
# (They depend on the user's taste, so we don't want to set them automatically in configure.ac.)
CFLAGS: -nologo -diagnostics:caret
LDFLAGS: -Xlinker -Xlinker -Xlinker -nologo
matrix:
- name: "x86_64 (MSVC): Windows (Debian stable, Wine)"
- name: "x86_64 (MSVC): Windows (Debian stable, Wine, int128_struct)"
env:
WIDEMUL: int128_struct
- name: "x86_64 (MSVC): Windows (Debian stable, Wine, int128_struct with __(u)mulh)"
env:
WIDEMUL: int128_struct
CPPFLAGS: -DSECP256K1_MSVC_MULH_TEST_OVERRIDE
- name: "i686 (MSVC): Windows (Debian stable, Wine)"
env:
HOST: i686-w64-mingw32
CC: /opt/msvc/bin/x86/cl
AR: /opt/msvc/bin/x86/lib
NM: /opt/msvc/bin/x86/dumpbin -symbols -headers
<< : *MERGE_BASE
test_script:
- ./ci/cirrus.sh
<< : *CAT_LOGS
# Sanitizers
task:
<< : *LINUX_CONTAINER
env:
ECDH: yes
RECOVERY: yes
SCHNORRSIG: yes
CTIMETESTS: no
matrix:
- name: "Valgrind (memcheck)"
container:
cpu: 2
env:
# The `--error-exitcode` is required to make the test fail if valgrind found errors, otherwise it'll return 0 (https://www.valgrind.org/docs/manual/manual-core.html)
WRAPPER_CMD: "valgrind --error-exitcode=42"
SECP256K1_TEST_ITERS: 2
- name: "UBSan, ASan, LSan"
container:
memory: 2G
env:
CFLAGS: "-fsanitize=undefined,address -g"
UBSAN_OPTIONS: "print_stacktrace=1:halt_on_error=1"
ASAN_OPTIONS: "strict_string_checks=1:detect_stack_use_after_return=1:detect_leaks=1"
LSAN_OPTIONS: "use_unaligned=1"
SECP256K1_TEST_ITERS: 32
# Try to cover many configurations with just a tiny matrix.
matrix:
- env:
ASM: auto
- env:
ASM: no
ECMULTGENPRECISION: 2
ECMULTWINDOW: 2
matrix:
- env:
CC: clang
- env:
HOST: i686-linux-gnu
CC: i686-linux-gnu-gcc
<< : *MERGE_BASE
test_script:
- ./ci/cirrus.sh
<< : *CAT_LOGS
# Memory sanitizers
task:
<< : *LINUX_CONTAINER
name: "MSan"
env:
ECDH: yes
RECOVERY: yes
SCHNORRSIG: yes
CTIMETESTS: yes
CC: clang
SECP256K1_TEST_ITERS: 32
ASM: no
WITH_VALGRIND: no
container:
memory: 2G
matrix:
- env:
CFLAGS: "-fsanitize=memory -g"
- env:
ECMULTGENPRECISION: 2
ECMULTWINDOW: 2
CFLAGS: "-fsanitize=memory -g -O3"
<< : *MERGE_BASE
test_script:
- ./ci/cirrus.sh
<< : *CAT_LOGS
task:
name: "C++ -fpermissive (entire project)"
<< : *LINUX_CONTAINER
env:
CC: g++
CFLAGS: -fpermissive -g
CPPFLAGS: -DSECP256K1_CPLUSPLUS_TEST_OVERRIDE
WERROR_CFLAGS:
ECDH: yes
RECOVERY: yes
SCHNORRSIG: yes
<< : *MERGE_BASE
test_script:
- ./ci/cirrus.sh
<< : *CAT_LOGS
task:
name: "C++ (public headers)"
<< : *LINUX_CONTAINER
test_script:
- g++ -Werror include/*.h
- clang -Werror -x c++-header include/*.h
- /opt/msvc/bin/x64/cl.exe -c -WX -TP include/*.h
task:
name: "sage prover"
<< : *LINUX_CONTAINER
test_script:
- cd sage
- sage prove_group_implementations.sage
task:
name: "x86_64: Windows (VS 2022)"
windows_container:
image: cirrusci/windowsservercore:visualstudio2022
cpu: 4
memory: 3840MB
env:
PATH: '%CIRRUS_WORKING_DIR%\build\src\RelWithDebInfo;%PATH%'
x64_NATIVE_TOOLS: '"C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\VC\Auxiliary\Build\vcvars64.bat"'
# Ignore MSBuild warning MSB8029.
# See: https://learn.microsoft.com/en-us/visualstudio/msbuild/errors/msb8029?view=vs-2022
IgnoreWarnIntDirInTempDetected: 'true'
merge_script:
- PowerShell -NoLogo -Command if ($env:CIRRUS_PR -ne $null) { git fetch $env:CIRRUS_REPO_CLONE_URL pull/$env:CIRRUS_PR/merge; git reset --hard FETCH_HEAD; }
configure_script:
- '%x64_NATIVE_TOOLS%'
- cmake -E env CFLAGS="/WX" cmake -G "Visual Studio 17 2022" -A x64 -S . -B build -DSECP256K1_ENABLE_MODULE_RECOVERY=ON -DSECP256K1_BUILD_EXAMPLES=ON
linux_arm64_container_snippet: &LINUX_ARM64_CONTAINER
env_script:
- env | tee /tmp/env
build_script:
- '%x64_NATIVE_TOOLS%'
- cmake --build build --config RelWithDebInfo -- -property:UseMultiToolTask=true;CL_MPcount=5
check_script:
- '%x64_NATIVE_TOOLS%'
- ctest -C RelWithDebInfo --test-dir build -j 5
- build\src\RelWithDebInfo\bench_ecmult.exe
- build\src\RelWithDebInfo\bench_internal.exe
- build\src\RelWithDebInfo\bench.exe
- DOCKER_BUILDKIT=1 docker build --file "ci/linux-debian.Dockerfile" --tag="ci_secp256k1_arm"
- docker image prune --force # Cleanup stale layers
test_script:
- docker run --rm --mount "type=bind,src=./,dst=/ci_secp256k1" --env-file /tmp/env --replace --name "ci_secp256k1_arm" "ci_secp256k1_arm" bash -c "cd /ci_secp256k1/ && ./ci/ci.sh"
task:
name: "ARM64: Linux (Debian stable)"
persistent_worker:
labels:
type: arm64
env:
ECDH: yes
RECOVERY: yes
EXTRAKEYS: yes
SCHNORRSIG: yes
MUSIG: yes
ELLSWIFT: yes
matrix:
# Currently only gcc-snapshot, the other compilers are tested on GHA with QEMU
- env: { CC: 'gcc-snapshot' }
<< : *LINUX_ARM64_CONTAINER
<< : *CAT_LOGS
task:
name: "ARM64: Linux (Debian stable), Valgrind"
persistent_worker:
labels:
type: arm64
env:
ECDH: yes
RECOVERY: yes
EXTRAKEYS: yes
SCHNORRSIG: yes
MUSIG: yes
ELLSWIFT: yes
WRAPPER_CMD: 'valgrind --error-exitcode=42'
SECP256K1_TEST_ITERS: 2
matrix:
- env: { CC: 'gcc' }
- env: { CC: 'clang' }
- env: { CC: 'gcc-snapshot' }
- env: { CC: 'clang-snapshot' }
<< : *LINUX_ARM64_CONTAINER
<< : *CAT_LOGS

View File

@@ -10,6 +10,8 @@ ctime_tests
ecdh_example
ecdsa_example
schnorr_example
ellswift_example
musig_example
*.exe
*.so
*.a

View File

@@ -5,6 +5,83 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [0.6.0] - 2024-11-04
#### Added
- New module `musig` implements the MuSig2 multisignature scheme according to the [BIP 327 specification](https://github.com/bitcoin/bips/blob/master/bip-0327.mediawiki). See:
- Header file `include/secp256k1_musig.h` which defines the new API.
- Document `doc/musig.md` for further notes on API usage.
- Usage example `examples/musig.c`.
- New CMake variable `SECP256K1_APPEND_LDFLAGS` for appending linker flags to the build command.
#### Changed
- API functions now use a significantly more robust method to clear secrets from the stack before returning. However, secret clearing remains a best-effort security measure and cannot guarantee complete removal.
- Any type `secp256k1_foo` can now be forward-declared using `typedef struct secp256k1_foo secp256k1_foo;` (or also `struct secp256k1_foo;` in C++).
- Organized CMake build artifacts into dedicated directories (`bin/` for executables, `lib/` for libraries) to improve build output structure and Windows shared library compatibility.
#### Removed
- Removed the `secp256k1_scratch_space` struct and its associated functions `secp256k1_scratch_space_create` and `secp256k1_scratch_space_destroy` because the scratch space was unused in the API.
#### ABI Compatibility
The symbols `secp256k1_scratch_space_create` and `secp256k1_scratch_space_destroy` were removed.
Otherwise, the library maintains backward compatibility with versions 0.3.x through 0.5.x.
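As an illustration of the forward-declaration change above (a sketch, assuming the struct tags match the type names as the entry describes), a consumer can now refer to an opaque library type without including the header:
/* Forward declaration; no #include <secp256k1.h> needed here. */
typedef struct secp256k1_context secp256k1_context;
struct my_wrapper {
  secp256k1_context *ctx; /* a pointer to the opaque type suffices */
};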
## [0.5.1] - 2024-08-01
#### Added
- Added usage example for an ElligatorSwift key exchange.
#### Changed
- The default size of the precomputed table for signing was changed from 22 KiB to 86 KiB. The size can be changed with the configure option `--ecmult-gen-kb` (`SECP256K1_ECMULT_GEN_KB` for CMake).
- "auto" is no longer an accepted value for the `--with-ecmult-window` and `--with-ecmult-gen-kb` configure options (this also applies to `SECP256K1_ECMULT_WINDOW_SIZE` and `SECP256K1_ECMULT_GEN_KB` in CMake). To achieve the same configuration as previously provided by the "auto" value, omit setting the configure option explicitly.
#### Fixed
- Fixed compilation when the extrakeys module is disabled.
#### ABI Compatibility
The ABI is backward compatible with versions 0.5.0, 0.4.x and 0.3.x.
## [0.5.0] - 2024-05-06
#### Added
- New function `secp256k1_ec_pubkey_sort` that sorts public keys using lexicographic (of compressed serialization) order.
#### Changed
- The implementation of the point multiplication algorithm used for signing and public key generation was changed, resulting in improved performance for those operations.
- The related configure option `--ecmult-gen-precision` was replaced with `--ecmult-gen-kb` (`SECP256K1_ECMULT_GEN_KB` for CMake).
- This changes the supported precomputed table sizes for these operations. The new supported sizes are 2 KiB, 22 KiB, or 86 KiB (while the old supported sizes were 32 KiB, 64 KiB, or 512 KiB).
#### ABI Compatibility
The ABI is backward compatible with versions 0.4.x and 0.3.x.
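For illustration, a hedged sketch of calling the sorting function added above, assuming the upstream signature that takes an array of pointers (error handling elided):
#include <secp256k1.h>
/* Sketch: sort two public keys in lexicographic order of their
 * 33-byte compressed serializations. */
void sort_two(const secp256k1_context *ctx,
              const secp256k1_pubkey *a, const secp256k1_pubkey *b) {
  const secp256k1_pubkey *keys[2] = {a, b};
  secp256k1_ec_pubkey_sort(ctx, keys, 2);
}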
## [0.4.1] - 2023-12-21
#### Changed
- The point multiplication algorithm used for ECDH operations (module `ecdh`) was replaced with a slightly faster one.
- Optional handwritten x86_64 assembly for field operations was removed because modern C compilers are able to output more efficient assembly. This change results in a significant speedup of some library functions when handwritten x86_64 assembly is enabled (`--with-asm=x86_64` in GNU Autotools, `-DSECP256K1_ASM=x86_64` in CMake), which is the default on x86_64. Benchmarks with GCC 10.5.0 show a 10% speedup for `secp256k1_ecdsa_verify` and `secp256k1_schnorrsig_verify`.
#### ABI Compatibility
The ABI is backward compatible with versions 0.4.0 and 0.3.x.
## [0.4.0] - 2023-09-04
#### Added
- New module `ellswift` implements ElligatorSwift encoding for public keys and x-only Diffie-Hellman key exchange for them.
ElligatorSwift permits representing secp256k1 public keys as 64-byte arrays which cannot be distinguished from uniformly random. See:
- Header file `include/secp256k1_ellswift.h` which defines the new API.
- Document `doc/ellswift.md` which explains the mathematical background of the scheme.
- The [paper](https://eprint.iacr.org/2022/759) on which the scheme is based.
- We now test the library with unreleased development snapshots of GCC and Clang. This gives us an early chance to catch miscompilations and constant-time issues introduced by the compiler (such as those that led to the previous two releases).
#### Fixed
- Fixed symbol visibility in Windows DLL builds, where three internal library symbols were wrongly exported.
#### Changed
- When consuming libsecp256k1 as a static library on Windows, the user must now define the `SECP256K1_STATIC` macro before including `secp256k1.h`.
#### ABI Compatibility
This release is backward compatible with the ABI of 0.3.0, 0.3.1, and 0.3.2. Symbol visibility is now believed to be handled properly on supported platforms and is now considered to be part of the ABI. Please report any improperly exported symbols as a bug.
## [0.3.2] - 2023-05-13
We strongly recommend updating to 0.3.2 if you use or plan to use GCC >=13 to compile libsecp256k1. When in doubt, check the GCC version using `gcc -v`.
@@ -85,7 +162,11 @@ This version was in fact never released.
The number was given by the build system since the introduction of autotools in Jan 2014 (ea0fe5a5bf0c04f9cc955b2966b614f5f378c6f6).
Therefore, this version number does not uniquely identify a set of source files.
[unreleased]: https://github.com/bitcoin-core/secp256k1/compare/v0.3.2...HEAD
[0.6.0]: https://github.com/bitcoin-core/secp256k1/compare/v0.5.1...v0.6.0
[0.5.1]: https://github.com/bitcoin-core/secp256k1/compare/v0.5.0...v0.5.1
[0.5.0]: https://github.com/bitcoin-core/secp256k1/compare/v0.4.1...v0.5.0
[0.4.1]: https://github.com/bitcoin-core/secp256k1/compare/v0.4.0...v0.4.1
[0.4.0]: https://github.com/bitcoin-core/secp256k1/compare/v0.3.2...v0.4.0
[0.3.2]: https://github.com/bitcoin-core/secp256k1/compare/v0.3.1...v0.3.2
[0.3.1]: https://github.com/bitcoin-core/secp256k1/compare/v0.3.0...v0.3.1
[0.3.0]: https://github.com/bitcoin-core/secp256k1/compare/v0.2.0...v0.3.0

View File

@@ -1,32 +1,29 @@
cmake_minimum_required(VERSION 3.13)
if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.15)
# MSVC runtime library flags are selected by the CMAKE_MSVC_RUNTIME_LIBRARY abstraction.
cmake_policy(SET CMP0091 NEW)
# MSVC warning flags are not in CMAKE_<LANG>_FLAGS by default.
cmake_policy(SET CMP0092 NEW)
endif()
cmake_minimum_required(VERSION 3.16)
#=============================
# Project / Package metadata
#=============================
project(libsecp256k1
# The package (a.k.a. release) version is based on semantic versioning 2.0.0 of
# the API. All changes in experimental modules are treated as
# backwards-compatible and therefore at most increase the minor version.
VERSION 0.3.2
VERSION 0.6.0
DESCRIPTION "Optimized C library for ECDSA signatures and secret/public key operations on curve secp256k1."
HOMEPAGE_URL "https://github.com/bitcoin-core/secp256k1"
LANGUAGES C
)
enable_testing()
list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
if(CMAKE_VERSION VERSION_LESS 3.21)
get_directory_property(parent_directory PARENT_DIRECTORY)
if(parent_directory)
set(PROJECT_IS_TOP_LEVEL OFF CACHE INTERNAL "Emulates CMake 3.21+ behavior.")
set(${PROJECT_NAME}_IS_TOP_LEVEL OFF CACHE INTERNAL "Emulates CMake 3.21+ behavior.")
# Emulates CMake 3.21+ behavior.
if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
set(PROJECT_IS_TOP_LEVEL ON)
set(${PROJECT_NAME}_IS_TOP_LEVEL ON)
else()
set(PROJECT_IS_TOP_LEVEL ON CACHE INTERNAL "Emulates CMake 3.21+ behavior.")
set(${PROJECT_NAME}_IS_TOP_LEVEL ON CACHE INTERNAL "Emulates CMake 3.21+ behavior.")
set(PROJECT_IS_TOP_LEVEL OFF)
set(${PROJECT_NAME}_IS_TOP_LEVEL OFF)
endif()
unset(parent_directory)
endif()
# The library version is based on libtool versioning of the ABI. The set of
@@ -34,15 +31,19 @@ endif()
# https://www.gnu.org/software/libtool/manual/html_node/Updating-version-info.html
# All changes in experimental modules are treated as if they don't affect the
# interface and therefore only increase the revision.
set(${PROJECT_NAME}_LIB_VERSION_CURRENT 2)
set(${PROJECT_NAME}_LIB_VERSION_REVISION 2)
set(${PROJECT_NAME}_LIB_VERSION_CURRENT 5)
set(${PROJECT_NAME}_LIB_VERSION_REVISION 0)
set(${PROJECT_NAME}_LIB_VERSION_AGE 0)
#=============================
# Language setup
#=============================
set(CMAKE_C_STANDARD 90)
set(CMAKE_C_EXTENSIONS OFF)
list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
#=============================
# Configurable options
#=============================
option(BUILD_SHARED_LIBS "Build shared libraries." ON)
option(SECP256K1_DISABLE_SHARED "Disable shared library. Overrides BUILD_SHARED_LIBS." OFF)
if(SECP256K1_DISABLE_SHARED)
@@ -51,24 +52,49 @@ endif()
option(SECP256K1_INSTALL "Enable installation." ${PROJECT_IS_TOP_LEVEL})
## Modules
# We declare all options before processing them, to make sure we can express
# dependencies while processing.
option(SECP256K1_ENABLE_MODULE_ECDH "Enable ECDH module." ON)
if(SECP256K1_ENABLE_MODULE_ECDH)
add_compile_definitions(ENABLE_MODULE_ECDH=1)
option(SECP256K1_ENABLE_MODULE_RECOVERY "Enable ECDSA pubkey recovery module." OFF)
option(SECP256K1_ENABLE_MODULE_EXTRAKEYS "Enable extrakeys module." ON)
option(SECP256K1_ENABLE_MODULE_SCHNORRSIG "Enable schnorrsig module." ON)
option(SECP256K1_ENABLE_MODULE_MUSIG "Enable musig module." ON)
option(SECP256K1_ENABLE_MODULE_ELLSWIFT "Enable ElligatorSwift module." ON)
# Processing must be done in a topological sorting of the dependency graph
# (dependent module first).
if(SECP256K1_ENABLE_MODULE_ELLSWIFT)
add_compile_definitions(ENABLE_MODULE_ELLSWIFT=1)
endif()
if(SECP256K1_ENABLE_MODULE_MUSIG)
if(DEFINED SECP256K1_ENABLE_MODULE_SCHNORRSIG AND NOT SECP256K1_ENABLE_MODULE_SCHNORRSIG)
message(FATAL_ERROR "Module dependency error: You have disabled the schnorrsig module explicitly, but it is required by the musig module.")
endif()
set(SECP256K1_ENABLE_MODULE_SCHNORRSIG ON)
add_compile_definitions(ENABLE_MODULE_MUSIG=1)
endif()
if(SECP256K1_ENABLE_MODULE_SCHNORRSIG)
if(DEFINED SECP256K1_ENABLE_MODULE_EXTRAKEYS AND NOT SECP256K1_ENABLE_MODULE_EXTRAKEYS)
message(FATAL_ERROR "Module dependency error: You have disabled the extrakeys module explicitly, but it is required by the schnorrsig module.")
endif()
set(SECP256K1_ENABLE_MODULE_EXTRAKEYS ON)
add_compile_definitions(ENABLE_MODULE_SCHNORRSIG=1)
endif()
if(SECP256K1_ENABLE_MODULE_EXTRAKEYS)
add_compile_definitions(ENABLE_MODULE_EXTRAKEYS=1)
endif()
option(SECP256K1_ENABLE_MODULE_RECOVERY "Enable ECDSA pubkey recovery module." OFF)
if(SECP256K1_ENABLE_MODULE_RECOVERY)
add_compile_definitions(ENABLE_MODULE_RECOVERY=1)
endif()
option(SECP256K1_ENABLE_MODULE_EXTRAKEYS "Enable extrakeys module." ON)
option(SECP256K1_ENABLE_MODULE_SCHNORRSIG "Enable schnorrsig module." ON)
if(SECP256K1_ENABLE_MODULE_SCHNORRSIG)
set(SECP256K1_ENABLE_MODULE_EXTRAKEYS ON)
add_compile_definitions(ENABLE_MODULE_SCHNORRSIG=1)
endif()
if(SECP256K1_ENABLE_MODULE_EXTRAKEYS)
add_compile_definitions(ENABLE_MODULE_EXTRAKEYS=1)
if(SECP256K1_ENABLE_MODULE_ECDH)
add_compile_definitions(ENABLE_MODULE_ECDH=1)
endif()
option(SECP256K1_USE_EXTERNAL_DEFAULT_CALLBACKS "Enable external default callback functions." OFF)
@@ -76,22 +102,25 @@ if(SECP256K1_USE_EXTERNAL_DEFAULT_CALLBACKS)
add_compile_definitions(USE_EXTERNAL_DEFAULT_CALLBACKS=1)
endif()
set(SECP256K1_ECMULT_WINDOW_SIZE "AUTO" CACHE STRING "Window size for ecmult precomputation for verification, specified as integer in range [2..24]. \"AUTO\" is a reasonable setting for desktop machines (currently 15). [default=AUTO]")
set_property(CACHE SECP256K1_ECMULT_WINDOW_SIZE PROPERTY STRINGS "AUTO" 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24)
set(SECP256K1_ECMULT_WINDOW_SIZE 15 CACHE STRING "Window size for ecmult precomputation for verification, specified as integer in range [2..24]. The default value is a reasonable setting for desktop machines (currently 15). [default=15]")
set_property(CACHE SECP256K1_ECMULT_WINDOW_SIZE PROPERTY STRINGS 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24)
include(CheckStringOptionValue)
check_string_option_value(SECP256K1_ECMULT_WINDOW_SIZE)
if(SECP256K1_ECMULT_WINDOW_SIZE STREQUAL "AUTO")
set(SECP256K1_ECMULT_WINDOW_SIZE 15)
endif()
add_compile_definitions(ECMULT_WINDOW_SIZE=${SECP256K1_ECMULT_WINDOW_SIZE})
set(SECP256K1_ECMULT_GEN_PREC_BITS "AUTO" CACHE STRING "Precision bits to tune the precomputed table size for signing, specified as integer 2, 4 or 8. \"AUTO\" is a reasonable setting for desktop machines (currently 4). [default=AUTO]")
set_property(CACHE SECP256K1_ECMULT_GEN_PREC_BITS PROPERTY STRINGS "AUTO" 2 4 8)
check_string_option_value(SECP256K1_ECMULT_GEN_PREC_BITS)
if(SECP256K1_ECMULT_GEN_PREC_BITS STREQUAL "AUTO")
set(SECP256K1_ECMULT_GEN_PREC_BITS 4)
set(SECP256K1_ECMULT_GEN_KB 86 CACHE STRING "The size of the precomputed table for signing in multiples of 1024 bytes (on typical platforms). Larger values result in possibly better signing or key generation performance at the cost of a larger table. Valid choices are 2, 22, 86. The default value is a reasonable setting for desktop machines (currently 86). [default=86]")
set_property(CACHE SECP256K1_ECMULT_GEN_KB PROPERTY STRINGS 2 22 86)
check_string_option_value(SECP256K1_ECMULT_GEN_KB)
if(SECP256K1_ECMULT_GEN_KB EQUAL 2)
add_compile_definitions(COMB_BLOCKS=2)
add_compile_definitions(COMB_TEETH=5)
elseif(SECP256K1_ECMULT_GEN_KB EQUAL 22)
add_compile_definitions(COMB_BLOCKS=11)
add_compile_definitions(COMB_TEETH=6)
elseif(SECP256K1_ECMULT_GEN_KB EQUAL 86)
add_compile_definitions(COMB_BLOCKS=43)
add_compile_definitions(COMB_TEETH=6)
endif()
add_compile_definitions(ECMULT_GEN_PREC_BITS=${SECP256K1_ECMULT_GEN_PREC_BITS})
set(SECP256K1_TEST_OVERRIDE_WIDE_MULTIPLY "OFF" CACHE STRING "Test-only override of the (autodetected by the C code) \"widemul\" setting. Legal values are: \"OFF\", \"int128_struct\", \"int128\" or \"int64\". [default=OFF]")
set_property(CACHE SECP256K1_TEST_OVERRIDE_WIDE_MULTIPLY PROPERTY STRINGS "OFF" "int128_struct" "int128" "int64")
@@ -102,7 +131,7 @@ if(SECP256K1_TEST_OVERRIDE_WIDE_MULTIPLY)
endif()
mark_as_advanced(FORCE SECP256K1_TEST_OVERRIDE_WIDE_MULTIPLY)
set(SECP256K1_ASM "AUTO" CACHE STRING "Assembly optimizations to use: \"AUTO\", \"OFF\", \"x86_64\" or \"arm32\" (experimental). [default=AUTO]")
set(SECP256K1_ASM "AUTO" CACHE STRING "Assembly to use: \"AUTO\", \"OFF\", \"x86_64\" or \"arm32\" (experimental). [default=AUTO]")
set_property(CACHE SECP256K1_ASM PROPERTY STRINGS "AUTO" "OFF" "x86_64" "arm32")
check_string_option_value(SECP256K1_ASM)
if(SECP256K1_ASM STREQUAL "arm32")
@@ -112,7 +141,7 @@ if(SECP256K1_ASM STREQUAL "arm32")
if(HAVE_ARM32_ASM)
add_compile_definitions(USE_EXTERNAL_ASM=1)
else()
message(FATAL_ERROR "ARM32 assembly optimization requested but not available.")
message(FATAL_ERROR "ARM32 assembly requested but not available.")
endif()
elseif(SECP256K1_ASM)
include(CheckX86_64Assembly)
@@ -123,14 +152,14 @@ elseif(SECP256K1_ASM)
elseif(SECP256K1_ASM STREQUAL "AUTO")
set(SECP256K1_ASM "OFF")
else()
message(FATAL_ERROR "x86_64 assembly optimization requested but not available.")
message(FATAL_ERROR "x86_64 assembly requested but not available.")
endif()
endif()
option(SECP256K1_EXPERIMENTAL "Allow experimental configuration options." OFF)
if(NOT SECP256K1_EXPERIMENTAL)
if(SECP256K1_ASM STREQUAL "arm32")
message(FATAL_ERROR "ARM32 assembly optimization is experimental. Use -DSECP256K1_EXPERIMENTAL=ON to allow.")
message(FATAL_ERROR "ARM32 assembly is experimental. Use -DSECP256K1_EXPERIMENTAL=ON to allow.")
endif()
endif()
@@ -167,7 +196,7 @@ else()
string(REGEX REPLACE "-DNDEBUG[ \t\r\n]*" "" CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE}")
string(REGEX REPLACE "-DNDEBUG[ \t\r\n]*" "" CMAKE_C_FLAGS_MINSIZEREL "${CMAKE_C_FLAGS_MINSIZEREL}")
# Prefer -O2 optimization level. (-O3 is CMake's default for Release for many compilers.)
string(REGEX REPLACE "-O3[ \t\r\n]*" "-O2" CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE}")
string(REGEX REPLACE "-O3( |$)" "-O2\\1" CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE}")
endif()
# Define custom "Coverage" build type.
@@ -189,31 +218,37 @@ mark_as_advanced(
CMAKE_SHARED_LINKER_FLAGS_COVERAGE
)
get_property(is_multi_config GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
set(default_build_type "RelWithDebInfo")
if(is_multi_config)
set(CMAKE_CONFIGURATION_TYPES "${default_build_type}" "Release" "Debug" "MinSizeRel" "Coverage" CACHE STRING
"Supported configuration types."
FORCE
)
else()
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY
STRINGS "${default_build_type}" "Release" "Debug" "MinSizeRel" "Coverage"
)
if(NOT CMAKE_BUILD_TYPE)
message(STATUS "Setting build type to \"${default_build_type}\" as none was specified")
set(CMAKE_BUILD_TYPE "${default_build_type}" CACHE STRING
"Choose the type of build."
if(PROJECT_IS_TOP_LEVEL)
get_property(is_multi_config GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
set(default_build_type "RelWithDebInfo")
if(is_multi_config)
set(CMAKE_CONFIGURATION_TYPES "${default_build_type}" "Release" "Debug" "MinSizeRel" "Coverage" CACHE STRING
"Supported configuration types."
FORCE
)
else()
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY
STRINGS "${default_build_type}" "Release" "Debug" "MinSizeRel" "Coverage"
)
if(NOT CMAKE_BUILD_TYPE)
message(STATUS "Setting build type to \"${default_build_type}\" as none was specified")
set(CMAKE_BUILD_TYPE "${default_build_type}" CACHE STRING
"Choose the type of build."
FORCE
)
endif()
endif()
endif()
include(TryAppendCFlags)
if(MSVC)
# Keep the following commands ordered lexicographically.
try_append_c_flags(/W2) # Moderate warning level.
try_append_c_flags(/W3) # Production quality warning level.
try_append_c_flags(/wd4146) # Disable warning C4146 "unary minus operator applied to unsigned type, result still unsigned".
try_append_c_flags(/wd4244) # Disable warning C4244 "'conversion' conversion from 'type1' to 'type2', possible loss of data".
try_append_c_flags(/wd4267) # Disable warning C4267 "'var' : conversion from 'size_t' to 'type', possible loss of data".
# Eliminate deprecation warnings for the older, less secure functions.
add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
else()
# Keep the following commands ordered lexicographically.
try_append_c_flags(-pedantic)
@@ -234,17 +269,41 @@ endif()
set(CMAKE_C_VISIBILITY_PRESET hidden)
# Ask CTest to create a "check" target (e.g., make check) as alias for the "test" target.
# CTEST_TEST_TARGET_ALIAS is not documented but supposed to be user-facing.
# See: https://gitlab.kitware.com/cmake/cmake/-/commit/816c9d1aa1f2b42d40c81a991b68c96eb12b6d2
set(CTEST_TEST_TARGET_ALIAS check)
include(CTest)
# We do not use CTest's BUILD_TESTING because a single toggle for all tests is too coarse for our needs.
mark_as_advanced(BUILD_TESTING)
if(SECP256K1_BUILD_BENCHMARK OR SECP256K1_BUILD_TESTS OR SECP256K1_BUILD_EXHAUSTIVE_TESTS OR SECP256K1_BUILD_CTIME_TESTS OR SECP256K1_BUILD_EXAMPLES)
enable_testing()
set(print_msan_notice)
if(SECP256K1_BUILD_CTIME_TESTS)
include(CheckMemorySanitizer)
check_memory_sanitizer(msan_enabled)
if(msan_enabled)
try_append_c_flags(-fno-sanitize-memory-param-retval)
set(print_msan_notice YES)
endif()
unset(msan_enabled)
endif()
set(SECP256K1_APPEND_CFLAGS "" CACHE STRING "Compiler flags that are appended to the command line after all other flags added by the build system. This variable is intended for debugging and special builds.")
if(SECP256K1_APPEND_CFLAGS)
# Appending to this low-level rule variable is the only way to
# guarantee that the flags appear at the end of the command line.
string(APPEND CMAKE_C_COMPILE_OBJECT " ${SECP256K1_APPEND_CFLAGS}")
endif()
set(SECP256K1_APPEND_LDFLAGS "" CACHE STRING "Linker flags that are appended to the command line after all other flags added by the build system. This variable is intended for debugging and special builds.")
if(SECP256K1_APPEND_LDFLAGS)
# Appending to this low-level rule variable is the only way to
# guarantee that the flags appear at the end of the command line.
string(APPEND CMAKE_C_CREATE_SHARED_LIBRARY " ${SECP256K1_APPEND_LDFLAGS}")
string(APPEND CMAKE_C_LINK_EXECUTABLE " ${SECP256K1_APPEND_LDFLAGS}")
endif()
if(NOT CMAKE_RUNTIME_OUTPUT_DIRECTORY)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/bin)
endif()
if(NOT CMAKE_LIBRARY_OUTPUT_DIRECTORY)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
endif()
if(NOT CMAKE_ARCHIVE_OUTPUT_DIRECTORY)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
endif()
add_subdirectory(src)
if(SECP256K1_BUILD_EXAMPLES)
add_subdirectory(examples)
@@ -266,11 +325,13 @@ message(" ECDH ................................ ${SECP256K1_ENABLE_MODULE_ECDH}
message(" ECDSA pubkey recovery ............... ${SECP256K1_ENABLE_MODULE_RECOVERY}")
message(" extrakeys ........................... ${SECP256K1_ENABLE_MODULE_EXTRAKEYS}")
message(" schnorrsig .......................... ${SECP256K1_ENABLE_MODULE_SCHNORRSIG}")
message(" musig ............................... ${SECP256K1_ENABLE_MODULE_MUSIG}")
message(" ElligatorSwift ...................... ${SECP256K1_ENABLE_MODULE_ELLSWIFT}")
message("Parameters:")
message(" ecmult window size .................. ${SECP256K1_ECMULT_WINDOW_SIZE}")
message(" ecmult gen precision bits ........... ${SECP256K1_ECMULT_GEN_PREC_BITS}")
message(" ecmult gen table size ............... ${SECP256K1_ECMULT_GEN_KB} KiB")
message("Optional features:")
message(" assembly optimization ............... ${SECP256K1_ASM}")
message(" assembly ............................ ${SECP256K1_ASM}")
message(" external callbacks .................. ${SECP256K1_USE_EXTERNAL_DEFAULT_CALLBACKS}")
if(SECP256K1_TEST_OVERRIDE_WIDE_MULTIPLY)
message(" wide multiplication (test-only) ..... ${SECP256K1_TEST_OVERRIDE_WIDE_MULTIPLY}")
@@ -297,7 +358,7 @@ message("Valgrind .............................. ${SECP256K1_VALGRIND}")
get_directory_property(definitions COMPILE_DEFINITIONS)
string(REPLACE ";" " " definitions "${definitions}")
message("Preprocessor defined macros ........... ${definitions}")
message("C compiler ............................ ${CMAKE_C_COMPILER}")
message("C compiler ............................ ${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}, ${CMAKE_C_COMPILER}")
message("CFLAGS ................................ ${CMAKE_C_FLAGS}")
get_directory_property(compile_options COMPILE_OPTIONS)
string(REPLACE ";" " " compile_options "${compile_options}")
@@ -320,7 +381,20 @@ else()
message(" - LDFLAGS for executables ............ ${CMAKE_EXE_LINKER_FLAGS_DEBUG}")
message(" - LDFLAGS for shared libraries ....... ${CMAKE_SHARED_LINKER_FLAGS_DEBUG}")
endif()
message("\n")
if(SECP256K1_APPEND_CFLAGS)
message("SECP256K1_APPEND_CFLAGS ............... ${SECP256K1_APPEND_CFLAGS}")
endif()
if(SECP256K1_APPEND_LDFLAGS)
message("SECP256K1_APPEND_LDFLAGS .............. ${SECP256K1_APPEND_LDFLAGS}")
endif()
message("")
if(print_msan_notice)
message(
"Note:\n"
" MemorySanitizer detected, tried to add -fno-sanitize-memory-param-retval to compile options\n"
" to avoid false positives in ctime_tests. Pass -DSECP256K1_BUILD_CTIME_TESTS=OFF to avoid this.\n"
)
endif()
if(SECP256K1_EXPERIMENTAL)
message(
" ******\n"

external/secp256k1/CONTRIBUTING.md vendored Normal file

@@ -0,0 +1,108 @@
# Contributing to libsecp256k1
## Scope
libsecp256k1 is a library for elliptic curve cryptography on the curve secp256k1, not a general-purpose cryptography library.
The library primarily serves the needs of the Bitcoin Core project but provides additional functionality for the benefit of the wider Bitcoin ecosystem.
## Adding new functionality or modules
The libsecp256k1 project welcomes contributions in the form of new functionality or modules, provided they are within the project's scope.
It is the responsibility of the contributors to convince the maintainers that the proposed functionality is within the project's scope, high-quality and maintainable.
Contributors are encouraged to provide the following in addition to the new code:
* **Specification:**
A specification can help significantly in reviewing the new code as it provides documentation and context.
It may justify various design decisions, give a motivation and outline security goals.
If the specification contains pseudocode, a reference implementation or test vectors, these can be used to compare with the proposed libsecp256k1 code.
* **Security Arguments:**
In addition to defining the security goals, it should be argued that the new functionality meets these goals.
Depending on the nature of the new functionality, a wide range of security arguments are acceptable, ranging from being "obviously secure" to rigorous proofs of security.
* **Relevance Arguments:**
The relevance of the new functionality for the Bitcoin ecosystem should be argued by outlining clear use cases.
These are not the only factors taken into account when considering whether to add new functionality.
The proposed new libsecp256k1 code must be of high quality, including API documentation and tests, as well as featuring a misuse-resistant API design.
We recommend reaching out to other contributors (see [Communication Channels](#communication-channels)) and getting feedback before implementing new functionality.
## Communication channels
Most communication about libsecp256k1 occurs on the GitHub repository: in issues, pull requests, or on the discussion board.
Additionally, there is an IRC channel dedicated to libsecp256k1, with biweekly meetings (see channel topic).
The channel is `#secp256k1` on Libera Chat.
The easiest way to participate on IRC is with the web client, [web.libera.chat](https://web.libera.chat/#secp256k1).
Chat history logs can be found at https://gnusha.org/secp256k1/.
## Contributor workflow & peer review
The Contributor Workflow & Peer Review in libsecp256k1 are similar to Bitcoin Core's workflow and review processes described in its [CONTRIBUTING.md](https://github.com/bitcoin/bitcoin/blob/master/CONTRIBUTING.md).
### Coding conventions
In addition, libsecp256k1 tries to maintain the following coding conventions:
* No runtime heap allocation (e.g., no `malloc`) unless explicitly requested by the caller (via `secp256k1_context_create` or `secp256k1_scratch_space_create`, for example). Moreover, it should be possible to use the library without any heap allocations. (A sketch illustrating these conventions in caller code follows this list.)
* The tests should cover all lines and branches of the library (see [Test coverage](#coverage)).
* Operations involving secret data should be tested for being constant time with respect to the secrets (see [src/ctime_tests.c](src/ctime_tests.c)).
* Local variables containing secret data should be cleared explicitly to try to delete secrets from memory.
* Use `secp256k1_memcmp_var` instead of `memcmp` (see [#823](https://github.com/bitcoin-core/secp256k1/issues/823)).
* As a rule of thumb, the default values for configuration options should target standard desktop machines and align with Bitcoin Core's defaults, and the tests should mostly exercise the default configuration (see [#1549](https://github.com/bitcoin-core/secp256k1/issues/1549#issuecomment-2200559257)).
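A minimal sketch (not from the project's documentation) of how several of these conventions look from the caller's side, using only the public API from [include/secp256k1.h](include/secp256k1.h); the placeholder key value is for illustration only:
```C
#include <secp256k1.h>

#include <stdlib.h>
#include <string.h>

int main(void) {
    secp256k1_context *ctx;
    secp256k1_pubkey pubkey;
    unsigned char seckey[32] = {0};

    /* Heap allocation happens only here, explicitly requested by the caller. */
    ctx = secp256k1_context_create(SECP256K1_CONTEXT_NONE);
    if (ctx == NULL) {
        return EXIT_FAILURE;
    }

    /* In real code, fill seckey with 32 bytes from a secure random source. */
    seckey[31] = 1; /* placeholder, for illustration only */

    if (!secp256k1_ec_pubkey_create(ctx, &pubkey, seckey)) {
        secp256k1_context_destroy(ctx);
        return EXIT_FAILURE;
    }

    /* Clear local secret data explicitly before it goes out of scope; memset
     * stands in here for a clearing routine the compiler cannot optimize away. */
    memset(seckey, 0, sizeof(seckey));

    secp256k1_context_destroy(ctx);
    return EXIT_SUCCESS;
}
```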
#### Style conventions
* Commits should be atomic and diffs should be easy to read. For this reason, do not mix any formatting fixes or code moves with actual code changes. Make sure each individual commit is hygienic: that it builds successfully on its own without warnings, errors, regressions, or test failures.
* New code should adhere to the style of existing, in particular surrounding, code. Other than that, we do not enforce strict rules for code formatting.
* The code conforms to C89. Most notably, that means that only `/* ... */` comments are allowed (no `//` line comments). Moreover, any declarations in a `{ ... }` block (e.g., a function) must appear at the beginning of the block before any statements. When you would like to declare a variable in the middle of a block, you can open a new block:
```C
void secp256k_foo(void) {
    unsigned int x = 1;          /* declaration */
    int y = 2*x;                 /* declaration */
    x = 17;                      /* statement */
    {
        int a, b;                /* declaration */
        a = x + y;               /* statement */
        secp256k_bar(x, &b);     /* statement */
    }
}
```
* Use `unsigned int` instead of just `unsigned`.
* Use `void *ptr` instead of `void* ptr`.
* Arguments of the publicly-facing API must have a specific order defined in [include/secp256k1.h](include/secp256k1.h) (see the sketch after this list).
* User-facing comment lines in headers should be limited to 80 chars if possible.
* All identifiers in file scope should start with `secp256k1_`.
* Avoid trailing whitespace.
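As a sketch of the argument-order convention (context pointer first, then output arguments, then input-only arguments), the declaration of `secp256k1_ec_pubkey_create` follows this pattern (API attribute macros omitted here):
```C
/* 1. context pointer, 2. output arguments, 3. input-only arguments. */
int secp256k1_ec_pubkey_create(
    const secp256k1_context *ctx, /* context */
    secp256k1_pubkey *pubkey,     /* output: computed public key */
    const unsigned char *seckey   /* input: 32-byte secret key */
);
```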
### Tests
#### Coverage
This library aims to have full coverage of reachable lines and branches.
To create a test coverage report, configure with `--enable-coverage` (use of GCC is necessary):

    $ ./configure --enable-coverage

Run the tests:

    $ make check

To create a report, `gcovr` is recommended, as it includes branch coverage reporting:

    $ gcovr --exclude 'src/bench*' --print-summary

To create an HTML report with coloured and annotated source code:

    $ mkdir -p coverage
    $ gcovr --exclude 'src/bench*' --html --html-details -o coverage/coverage.html
#### Exhaustive tests
There are tests of several functions in which a small group replaces the secp256k1 group.
These tests are *exhaustive* since they provide all elements and scalars of the small group as input arguments (see [src/tests_exhaustive.c](src/tests_exhaustive.c)).
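As a toy sketch of the exhaustive idea (not the library's actual harness; the modulus and the checked property are chosen here purely for illustration), one can enumerate every element of a small group and assert a property for each:
```C
#include <assert.h>

#define GROUP_ORDER 13 /* toy prime modulus standing in for a small test group */

static int mul_mod(int a, int b) {
    return (a * b) % GROUP_ORDER;
}

int main(void) {
    int a, b, inverses;
    /* Exhaustive sweep: every nonzero element of Z_13 must have exactly one
     * multiplicative inverse, and we try all candidate inverses for each. */
    for (a = 1; a < GROUP_ORDER; a++) {
        inverses = 0;
        for (b = 1; b < GROUP_ORDER; b++) {
            if (mul_mod(a, b) == 1) {
                inverses++;
            }
        }
        assert(inverses == 1);
    }
    return 0;
}
```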
### Benchmarks
See `src/bench*.c` for examples of benchmarks.


@@ -37,7 +37,6 @@ noinst_HEADERS += src/field_10x26_impl.h
noinst_HEADERS += src/field_5x52.h
noinst_HEADERS += src/field_5x52_impl.h
noinst_HEADERS += src/field_5x52_int128_impl.h
noinst_HEADERS += src/field_5x52_asm_impl.h
noinst_HEADERS += src/modinv32.h
noinst_HEADERS += src/modinv32_impl.h
noinst_HEADERS += src/modinv64.h
@@ -46,6 +45,7 @@ noinst_HEADERS += src/precomputed_ecmult.h
noinst_HEADERS += src/precomputed_ecmult_gen.h
noinst_HEADERS += src/assumptions.h
noinst_HEADERS += src/checkmem.h
noinst_HEADERS += src/testutil.h
noinst_HEADERS += src/util.h
noinst_HEADERS += src/int128.h
noinst_HEADERS += src/int128_impl.h
@@ -64,6 +64,8 @@ noinst_HEADERS += src/field.h
noinst_HEADERS += src/field_impl.h
noinst_HEADERS += src/bench.h
noinst_HEADERS += src/wycheproof/ecdsa_secp256k1_sha256_bitcoin_test.h
noinst_HEADERS += src/hsort.h
noinst_HEADERS += src/hsort_impl.h
noinst_HEADERS += contrib/lax_der_parsing.h
noinst_HEADERS += contrib/lax_der_parsing.c
noinst_HEADERS += contrib/lax_der_privatekey_parsing.h
@@ -153,7 +155,7 @@ endif
if USE_EXAMPLES
noinst_PROGRAMS += ecdsa_example
ecdsa_example_SOURCES = examples/ecdsa.c
ecdsa_example_CPPFLAGS = -I$(top_srcdir)/include
ecdsa_example_CPPFLAGS = -I$(top_srcdir)/include -DSECP256K1_STATIC
ecdsa_example_LDADD = libsecp256k1.la
ecdsa_example_LDFLAGS = -static
if BUILD_WINDOWS
@@ -163,7 +165,7 @@ TESTS += ecdsa_example
if ENABLE_MODULE_ECDH
noinst_PROGRAMS += ecdh_example
ecdh_example_SOURCES = examples/ecdh.c
ecdh_example_CPPFLAGS = -I$(top_srcdir)/include
ecdh_example_CPPFLAGS = -I$(top_srcdir)/include -DSECP256K1_STATIC
ecdh_example_LDADD = libsecp256k1.la
ecdh_example_LDFLAGS = -static
if BUILD_WINDOWS
@@ -174,7 +176,7 @@ endif
if ENABLE_MODULE_SCHNORRSIG
noinst_PROGRAMS += schnorr_example
schnorr_example_SOURCES = examples/schnorr.c
schnorr_example_CPPFLAGS = -I$(top_srcdir)/include
schnorr_example_CPPFLAGS = -I$(top_srcdir)/include -DSECP256K1_STATIC
schnorr_example_LDADD = libsecp256k1.la
schnorr_example_LDFLAGS = -static
if BUILD_WINDOWS
@@ -182,6 +184,28 @@ schnorr_example_LDFLAGS += -lbcrypt
endif
TESTS += schnorr_example
endif
if ENABLE_MODULE_ELLSWIFT
noinst_PROGRAMS += ellswift_example
ellswift_example_SOURCES = examples/ellswift.c
ellswift_example_CPPFLAGS = -I$(top_srcdir)/include -DSECP256K1_STATIC
ellswift_example_LDADD = libsecp256k1.la
ellswift_example_LDFLAGS = -static
if BUILD_WINDOWS
ellswift_example_LDFLAGS += -lbcrypt
endif
TESTS += ellswift_example
endif
if ENABLE_MODULE_MUSIG
noinst_PROGRAMS += musig_example
musig_example_SOURCES = examples/musig.c
musig_example_CPPFLAGS = -I$(top_srcdir)/include -DSECP256K1_STATIC
musig_example_LDADD = libsecp256k1.la
musig_example_LDFLAGS = -static
if BUILD_WINDOWS
musig_example_LDFLAGS += -lbcrypt
endif
TESTS += musig_example
endif
endif
### Precomputed tables
@@ -189,11 +213,11 @@ EXTRA_PROGRAMS = precompute_ecmult precompute_ecmult_gen
CLEANFILES = $(EXTRA_PROGRAMS)
precompute_ecmult_SOURCES = src/precompute_ecmult.c
precompute_ecmult_CPPFLAGS = $(SECP_CONFIG_DEFINES)
precompute_ecmult_CPPFLAGS = $(SECP_CONFIG_DEFINES) -DVERIFY
precompute_ecmult_LDADD = $(COMMON_LIB)
precompute_ecmult_gen_SOURCES = src/precompute_ecmult_gen.c
precompute_ecmult_gen_CPPFLAGS = $(SECP_CONFIG_DEFINES)
precompute_ecmult_gen_CPPFLAGS = $(SECP_CONFIG_DEFINES) -DVERIFY
precompute_ecmult_gen_LDADD = $(COMMON_LIB)
# See Automake manual, Section "Errors with distclean".
@@ -241,6 +265,7 @@ maintainer-clean-local: clean-testvectors
### Additional files to distribute
EXTRA_DIST = autogen.sh CHANGELOG.md SECURITY.md
EXTRA_DIST += doc/release-process.md doc/safegcd_implementation.md
EXTRA_DIST += doc/ellswift.md doc/musig.md
EXTRA_DIST += examples/EXAMPLES_COPYING
EXTRA_DIST += sage/gen_exhaustive_groups.sage
EXTRA_DIST += sage/gen_split_lambda_constants.sage
@@ -267,3 +292,11 @@ endif
if ENABLE_MODULE_SCHNORRSIG
include src/modules/schnorrsig/Makefile.am.include
endif
if ENABLE_MODULE_MUSIG
include src/modules/musig/Makefile.am.include
endif
if ENABLE_MODULE_ELLSWIFT
include src/modules/ellswift/Makefile.am.include
endif


@@ -1,11 +1,10 @@
libsecp256k1
============
[![Build Status](https://api.cirrus-ci.com/github/bitcoin-core/secp256k1.svg?branch=master)](https://cirrus-ci.com/github/bitcoin-core/secp256k1)
![Dependencies: None](https://img.shields.io/badge/dependencies-none-success)
[![irc.libera.chat #secp256k1](https://img.shields.io/badge/irc.libera.chat-%23secp256k1-success)](https://web.libera.chat/#secp256k1)
Optimized C library for ECDSA signatures and secret/public key operations on curve secp256k1.
High-performance high-assurance C library for digital signatures and other cryptographic primitives on the secp256k1 elliptic curve.
This library is intended to be the highest quality publicly available library for cryptography on the secp256k1 curve. However, the primary focus of its development has been usage in the Bitcoin system, and usage that differs from Bitcoin's may be less well tested or verified, or may suffer from a less well thought-out interface. Correct usage requires some care, including consideration of whether the library is fit for your application's purpose.
@@ -21,6 +20,8 @@ Features:
* Optional module for public key recovery.
* Optional module for ECDH key exchange.
* Optional module for Schnorr signatures according to [BIP-340](https://github.com/bitcoin/bips/blob/master/bip-0340.mediawiki).
* Optional module for ElligatorSwift key exchange according to [BIP-324](https://github.com/bitcoin/bips/blob/master/bip-0324.mediawiki).
* Optional module for MuSig2 Schnorr multi-signatures according to [BIP-327](https://github.com/bitcoin/bips/blob/master/bip-0327.mediawiki).
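A minimal sketch of the core ECDSA flow these modules build on, assuming a precomputed 32-byte message hash and placeholder key material for illustration only:
```C
#include <secp256k1.h>

#include <stdio.h>

int main(void) {
    secp256k1_context *ctx;
    secp256k1_pubkey pubkey;
    secp256k1_ecdsa_signature sig;
    unsigned char seckey[32] = {0};
    unsigned char msghash32[32] = {0}; /* in real code: SHA-256 of the message */

    seckey[31] = 42;   /* placeholder secret key, for illustration only */
    msghash32[31] = 1; /* placeholder message hash */

    ctx = secp256k1_context_create(SECP256K1_CONTEXT_NONE);
    if (ctx == NULL) {
        return 1;
    }
    if (!secp256k1_ec_pubkey_create(ctx, &pubkey, seckey)
        || !secp256k1_ecdsa_sign(ctx, &sig, msghash32, seckey, NULL, NULL)) {
        secp256k1_context_destroy(ctx);
        return 1;
    }
    /* secp256k1_ecdsa_verify returns 1 for a valid signature, 0 otherwise. */
    printf("verify: %d\n", secp256k1_ecdsa_verify(ctx, &sig, msghash32, &pubkey));

    secp256k1_context_destroy(ctx);
    return 0;
}
```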
Implementation details
----------------------
@@ -34,7 +35,7 @@ Implementation details
* Expose only higher level interfaces to minimize the API surface and improve application security. ("Be difficult to use insecurely.")
* Field operations
* Optimized implementation of arithmetic modulo the curve's field size (2^256 - 0x1000003D1).
* Using 5 52-bit limbs (including hand-optimized assembly for x86_64, by Diederik Huys).
* Using 5 52-bit limbs.
* Using 10 26-bit limbs (including hand-optimized assembly for 32-bit ARM, by Wladimir J. van der Laan).
* This is an experimental feature that has not received enough scrutiny to satisfy the standard of quality of this library but is made available for testing and review by the community.
* Scalar operations
@@ -80,9 +81,9 @@ To maintain a pristine source tree, CMake encourages to perform an out-of-source
$ mkdir build && cd build
$ cmake ..
$ make
$ make check # run the test suite
$ sudo make install # optional
$ cmake --build .
$ ctest # run the test suite
$ sudo cmake --install . # optional
To compile optional modules (such as Schnorr signatures), you need to run `cmake` with additional flags (such as `-DSECP256K1_ENABLE_MODULE_SCHNORRSIG=ON`). Run `cmake .. -LH` to see the full list of available flags.
@@ -114,31 +115,10 @@ Usage examples can be found in the [examples](examples) directory. To compile th
* [ECDSA example](examples/ecdsa.c)
* [Schnorr signatures example](examples/schnorr.c)
* [Deriving a shared secret (ECDH) example](examples/ecdh.c)
* [ElligatorSwift key exchange example](examples/ellswift.c)
To compile the Schnorr signature and ECDH examples, you also need to configure with `--enable-module-schnorrsig` and `--enable-module-ecdh`.
Test coverage
-----------
This library aims to have full coverage of the reachable lines and branches.
To create a test coverage report, configure with `--enable-coverage` (use of GCC is necessary):

    $ ./configure --enable-coverage

Run the tests:

    $ make check

To create a report, `gcovr` is recommended, as it includes branch coverage reporting:

    $ gcovr --exclude 'src/bench*' --print-summary

To create an HTML report with coloured and annotated source code:

    $ mkdir -p coverage
    $ gcovr --exclude 'src/bench*' --html --html-details -o coverage/coverage.html
Benchmark
------------
If configured with `--enable-benchmark` (which is the default), binaries for benchmarking the libsecp256k1 functions will be present in the root directory after the build.
@@ -155,3 +135,8 @@ Reporting a vulnerability
------------
See [SECURITY.md](SECURITY.md)
Contributing to libsecp256k1
------------
See [CONTRIBUTING.md](CONTRIBUTING.md)


@@ -45,6 +45,22 @@ fi
AC_MSG_RESULT($has_valgrind)
])
AC_DEFUN([SECP_MSAN_CHECK], [
AC_MSG_CHECKING(whether MemorySanitizer is enabled)
AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
  #if defined(__has_feature)
  # if __has_feature(memory_sanitizer)
     /* MemorySanitizer is enabled. */
  # else
  #  error "MemorySanitizer is disabled."
  # endif
  #else
  #  error "__has_feature is not defined."
  #endif
]])], [msan_enabled=yes], [msan_enabled=no])
AC_MSG_RESULT([$msan_enabled])
])
dnl SECP_TRY_APPEND_CFLAGS(flags, VAR)
dnl Append flags to VAR if CC accepts them.
AC_DEFUN([SECP_TRY_APPEND_CFLAGS], [


@@ -4,19 +4,21 @@ set -eux
export LC_ALL=C
# Print relevant CI environment to allow reproducing the job outside of CI.
# Print commit and relevant CI environment to allow reproducing the job outside of CI.
git show --no-patch
print_environment() {
# Turn off -x because it messes up the output
set +x
# There are many ways to print variable names and their content. This one
# does not rely on bash.
for var in WERROR_CFLAGS MAKEFLAGS BUILD \
ECMULTWINDOW ECMULTGENPRECISION ASM WIDEMUL WITH_VALGRIND EXTRAFLAGS \
EXPERIMENTAL ECDH RECOVERY SCHNORRSIG \
ECMULTWINDOW ECMULTGENKB ASM WIDEMUL WITH_VALGRIND EXTRAFLAGS \
EXPERIMENTAL ECDH RECOVERY EXTRAKEYS MUSIG SCHNORRSIG ELLSWIFT \
SECP256K1_TEST_ITERS BENCH SECP256K1_BENCH_ITERS CTIMETESTS \
EXAMPLES \
HOST WRAPPER_CMD \
CC CFLAGS CPPFLAGS AR NM
CC CFLAGS CPPFLAGS AR NM \
UBSAN_OPTIONS ASAN_OPTIONS LSAN_OPTIONS
do
eval "isset=\${$var+x}"
if [ -n "$isset" ]; then
@@ -30,19 +32,15 @@ print_environment() {
}
print_environment
# Start persistent wineserver if necessary.
# This speeds up jobs with many invocations of wine (e.g., ./configure with MSVC) tremendously.
case "$WRAPPER_CMD" in
*wine*)
# Make sure to shutdown wineserver whenever we exit.
trap "wineserver -k || true" EXIT INT HUP
# This is apparently only reliable when we run a dummy command such as "hh.exe" afterwards.
wineserver -p && wine hh.exe
env >> test_env.log
# If gcc is requested, assert that it's in fact gcc (and not some symlinked Apple clang).
case "${CC:-undefined}" in
*gcc*)
$CC -v 2>&1 | grep -q "gcc version" || exit 1;
;;
esac
env >> test_env.log
if [ -n "${CC+x}" ]; then
# The MSVC compiler "cl" doesn't understand "-v"
$CC -v || true
@@ -54,22 +52,55 @@ if [ -n "$WRAPPER_CMD" ]; then
$WRAPPER_CMD --version
fi
# Workaround for https://bugs.kde.org/show_bug.cgi?id=452758 (fixed in valgrind 3.20.0).
case "${CC:-undefined}" in
clang*)
if [ "$CTIMETESTS" = "yes" ] && [ "$WITH_VALGRIND" = "yes" ]
then
export CFLAGS="${CFLAGS:+$CFLAGS }-gdwarf-4"
else
case "$WRAPPER_CMD" in
valgrind*)
export CFLAGS="${CFLAGS:+$CFLAGS }-gdwarf-4"
;;
esac
fi
;;
esac
./autogen.sh
./configure \
--enable-experimental="$EXPERIMENTAL" \
--with-test-override-wide-multiply="$WIDEMUL" --with-asm="$ASM" \
--with-ecmult-window="$ECMULTWINDOW" \
--with-ecmult-gen-precision="$ECMULTGENPRECISION" \
--with-ecmult-gen-kb="$ECMULTGENKB" \
--enable-module-ecdh="$ECDH" --enable-module-recovery="$RECOVERY" \
--enable-module-ellswift="$ELLSWIFT" \
--enable-module-extrakeys="$EXTRAKEYS" \
--enable-module-schnorrsig="$SCHNORRSIG" \
--enable-module-musig="$MUSIG" \
--enable-examples="$EXAMPLES" \
--enable-ctime-tests="$CTIMETESTS" \
--with-valgrind="$WITH_VALGRIND" \
--host="$HOST" $EXTRAFLAGS
# We have set "-j<n>" in MAKEFLAGS.
make
build_exit_code=0
make > make.log 2>&1 || build_exit_code=$?
cat make.log
if [ $build_exit_code -ne 0 ]; then
case "${CC:-undefined}" in
*snapshot*)
# Ignore internal compiler errors in gcc-snapshot and clang-snapshot
grep -e "internal compiler error:" -e "PLEASE submit a bug report" make.log
return $?;
;;
*)
return 1;
;;
esac
fi
# Print information about binaries so that we can see that the architecture is correct
file *tests* || true


@@ -1,4 +1,17 @@
FROM debian:stable
FROM debian:stable-slim
SHELL ["/bin/bash", "-c"]
WORKDIR /root
# A too high maximum number of file descriptors (with the default value
# inherited from the docker host) can cause issues with some of our tools:
# - sanitizers hanging: https://github.com/google/sanitizers/issues/1662
# - valgrind crashing: https://stackoverflow.com/a/75293014
# This is not a problem on our CI hosts, but developers who run the image
# on their machines may run into this (e.g., on Arch Linux), so warn them.
# (Note that .bashrc is only executed in interactive bash shells.)
RUN echo 'if [[ $(ulimit -n) -gt 200000 ]]; then echo "WARNING: Very high value reported by \"ulimit -n\". Consider passing \"--ulimit nofile=32768\" to \"docker run\"."; fi' >> /root/.bashrc
RUN dpkg --add-architecture i386 && \
dpkg --add-architecture s390x && \
@@ -11,27 +24,56 @@ RUN dpkg --add-architecture i386 && \
RUN apt-get update && apt-get install --no-install-recommends -y \
git ca-certificates \
make automake libtool pkg-config dpkg-dev valgrind qemu-user \
gcc clang llvm libc6-dbg \
gcc clang llvm libclang-rt-dev libc6-dbg \
g++ \
gcc-i686-linux-gnu libc6-dev-i386-cross libc6-dbg:i386 libubsan1:i386 libasan6:i386 \
gcc-i686-linux-gnu libc6-dev-i386-cross libc6-dbg:i386 libubsan1:i386 libasan8:i386 \
gcc-s390x-linux-gnu libc6-dev-s390x-cross libc6-dbg:s390x \
gcc-arm-linux-gnueabihf libc6-dev-armhf-cross libc6-dbg:armhf \
gcc-aarch64-linux-gnu libc6-dev-arm64-cross libc6-dbg:arm64 \
gcc-powerpc64le-linux-gnu libc6-dev-ppc64el-cross libc6-dbg:ppc64el \
gcc-mingw-w64-x86-64-win32 wine64 wine \
gcc-mingw-w64-i686-win32 wine32 \
sagemath
python3 && \
if ! ( dpkg --print-architecture | grep --quiet "arm64" ) ; then \
apt-get install --no-install-recommends -y \
gcc-aarch64-linux-gnu libc6-dev-arm64-cross libc6-dbg:arm64 ;\
fi && \
apt-get clean && rm -rf /var/lib/apt/lists/*
WORKDIR /root
# The "wine" package provides a convience wrapper that we need
RUN apt-get update && apt-get install --no-install-recommends -y \
git ca-certificates wine64 wine python3-simplejson python3-six msitools winbind procps && \
git clone https://github.com/mstorsjo/msvc-wine && \
mkdir /opt/msvc && \
python3 msvc-wine/vsdownload.py --accept-license --dest /opt/msvc Microsoft.VisualStudio.Workload.VCTools && \
msvc-wine/install.sh /opt/msvc
# Build and install gcc snapshot
ARG GCC_SNAPSHOT_MAJOR=15
RUN apt-get update && apt-get install --no-install-recommends -y wget libgmp-dev libmpfr-dev libmpc-dev flex && \
mkdir gcc && cd gcc && \
wget --progress=dot:giga --https-only --recursive --accept '*.tar.xz' --level 1 --no-directories "https://gcc.gnu.org/pub/gcc/snapshots/LATEST-${GCC_SNAPSHOT_MAJOR}" && \
wget "https://gcc.gnu.org/pub/gcc/snapshots/LATEST-${GCC_SNAPSHOT_MAJOR}/sha512.sum" && \
sha512sum --check --ignore-missing sha512.sum && \
# We should have downloaded exactly one tar.xz file
ls && \
[ $(ls *.tar.xz | wc -l) -eq "1" ] && \
tar xf *.tar.xz && \
mkdir gcc-build && cd gcc-build && \
../*/configure --prefix=/opt/gcc-snapshot --enable-languages=c --disable-bootstrap --disable-multilib --without-isl && \
make -j $(nproc) && \
make install && \
cd ../.. && rm -rf gcc && \
ln -s /opt/gcc-snapshot/bin/gcc /usr/bin/gcc-snapshot && \
apt-get autoremove -y wget libgmp-dev libmpfr-dev libmpc-dev flex && \
apt-get clean && rm -rf /var/lib/apt/lists/*
# Install clang snapshot, see https://apt.llvm.org/
RUN \
# Setup GPG keys of LLVM repository
apt-get update && apt-get install --no-install-recommends -y wget && \
wget -qO- https://apt.llvm.org/llvm-snapshot.gpg.key | tee /etc/apt/trusted.gpg.d/apt.llvm.org.asc && \
# Add repository for this Debian release
. /etc/os-release && echo "deb http://apt.llvm.org/${VERSION_CODENAME} llvm-toolchain-${VERSION_CODENAME} main" >> /etc/apt/sources.list && \
apt-get update && \
# Determine the version number of the LLVM development branch
LLVM_VERSION=$(apt-cache search --names-only '^clang-[0-9]+$' | sort -V | tail -1 | cut -f1 -d" " | cut -f2 -d"-" ) && \
# Install
apt-get install --no-install-recommends -y "clang-${LLVM_VERSION}" && \
# Create symlink
ln -s "/usr/bin/clang-${LLVM_VERSION}" /usr/bin/clang-snapshot && \
# Clean up
apt-get autoremove -y wget && \
apt-get clean && rm -rf /var/lib/apt/lists/*
# Initialize the wine environment. Wait until the wineserver process has
# exited before closing the session, to avoid corrupting the wine prefix.
RUN wine64 wineboot --init && \
while (ps -A | grep wineserver) > /dev/null; do sleep 1; done


@@ -1,6 +1,6 @@
function(check_arm32_assembly)
try_compile(HAVE_ARM32_ASM
${CMAKE_BINARY_DIR}/check_arm32_assembly
SOURCES ${CMAKE_SOURCE_DIR}/cmake/source_arm32.s
${PROJECT_BINARY_DIR}/check_arm32_assembly
SOURCES ${PROJECT_SOURCE_DIR}/cmake/source_arm32.s
)
endfunction()


@@ -0,0 +1,18 @@
include_guard(GLOBAL)
include(CheckCSourceCompiles)
function(check_memory_sanitizer output)
  set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY)
  check_c_source_compiles("
    #if defined(__has_feature)
    #  if __has_feature(memory_sanitizer)
         /* MemorySanitizer is enabled. */
    #  else
    #    error \"MemorySanitizer is disabled.\"
    #  endif
    #else
    #  error \"__has_feature is not defined.\"
    #endif
  " HAVE_MSAN)
  set(${output} ${HAVE_MSAN} PARENT_SCOPE)
endfunction()


@@ -0,0 +1,8 @@
function(generate_pkg_config_file in_file)
  set(prefix ${CMAKE_INSTALL_PREFIX})
  set(exec_prefix \${prefix})
  set(libdir \${exec_prefix}/${CMAKE_INSTALL_LIBDIR})
  set(includedir \${prefix}/${CMAKE_INSTALL_INCLUDEDIR})
  set(PACKAGE_VERSION ${PROJECT_VERSION})
  configure_file(${in_file} ${PROJECT_NAME}.pc @ONLY)
endfunction()

Some files were not shown because too many files have changed in this diff.