refactor: remove OverlayFS delta caching entirely

THE GREAT CULLING: Remove all OverlayFS and delta caching logic.

After extensive investigation and testing, we determined that OverlayFS
file-level layering is fundamentally incompatible with ccache's access
patterns:

- ccache opens files with O_RDWR → kernel must provide a writable file handle
- OverlayFS must copy such files up to the upper layer immediately (copy-up cannot be deferred)
- Even with metacopy=on, metadata-only copy-ups still create entries in the upper layer
- Result: ~366MB deltas instead of tiny incremental diffs (see the sketch below)
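
For reference, the copy-up behavior is easy to reproduce outside CI. A
minimal sketch (illustrative only; assumes root and kernel overlayfs
support, all paths throwaway):

    mkdir -p lower upper work merged
    echo 'object code' > lower/cache.o
    sudo mount -t overlay overlay \
      -o lowerdir=lower,upperdir=upper,workdir=work merged
    # Opening O_RDWR triggers copy-up at open() time, before any write:
    python3 -c "open('merged/cache.o', 'r+').close()"
    ls upper/   # cache.o is now a full copy in the upper layer

Multiply that by every object file ccache touches and the "delta"
approaches the size of the cache itself.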

The fundamental constraint: you cannot have all three of:
1. Read-only lower layer (for base sharing)
2. Writable file handles (for O_RDWR)
3. Minimal deltas (for efficient caching)

Changes:
- Removed all OverlayFS mounting/unmounting logic
- Removed workspace and registry tracking
- Removed delta creation and restoration
- Removed use-deltas parameter
- Simplified to direct tar/extract workflow (condensed below)
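
The entire restore path now reduces to download-and-extract (condensed
from the diff below):

    S3_KEY="s3://${S3_BUCKET}/${MATCHED_KEY}-base.tar.zst"
    aws s3 cp "${S3_KEY}" "${TEMP_TARBALL}" --region "${S3_REGION}"
    mkdir -p "${TARGET_PATH}"
    zstd -d -c "${TEMP_TARBALL}" | tar -xf - -C "${TARGET_PATH}"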

Before: 726 lines across cache actions
After:  321 lines (~56% reduction)

Benefits:
- Simpler architecture (direct tar/extract)
- More maintainable (less code, less complexity)
- More reliable (fewer moving parts)
- Same performance (base-only mode was already in use)
- Clear path forward (restic/borg for future optimization)

Current state works great:
- Build times: 20-30 min → 2-5 min (80% improvement)
- Cache sizes: ~323-609 MB per branch (with zst compression; save-side sketch below)
- S3 costs: acceptable for current volume
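
The save action is one of the other changed files and is not shown in
this diff; assuming it mirrors the restore action's
${CACHE_KEY}-base.tar.zst naming, the save side is essentially the
reverse pipeline:

    # Sketch only - the actual save action is not part of this diff.
    tar -cf - -C "${TARGET_PATH}" . | zstd -T0 > cache.tar.zst
    aws s3 cp cache.tar.zst "s3://${S3_BUCKET}/${CACHE_KEY}-base.tar.zst" --region "${S3_REGION}"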

If bandwidth costs become problematic, migrate to restic/borg for
chunk-level deduplication (completely different architecture).
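
For the record, the restic variant would look roughly like this
(illustrative sketch; repository URL and password handling are
placeholders, nothing here is implemented):

    export RESTIC_REPOSITORY="s3:s3.amazonaws.com/xahau-build-cache"  # placeholder bucket
    export RESTIC_PASSWORD="<from secrets>"         # restic requires one
    restic init                                     # once per repository
    restic backup "${TARGET_PATH}"                  # uploads only changed chunks
    restic restore latest --target "${TARGET_PATH}"

Content-defined chunking would give the small incremental uploads that
OverlayFS could not, at the cost of a new dependency and repository
format.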
Author: Nicholas Dudfield
Date:   2025-10-31 10:30:31 +07:00
parent bd384e6bc1
commit 638cb0afe5
4 changed files with 83 additions and 492 deletions


@@ -1,5 +1,5 @@
-name: 'Xahau Cache Restore (S3 + OverlayFS)'
-description: 'Drop-in replacement for actions/cache/restore using S3 and OverlayFS for delta caching'
+name: 'Xahau Cache Restore (S3)'
+description: 'Drop-in replacement for actions/cache/restore using S3 storage'
 inputs:
   path:
@@ -28,10 +28,6 @@ inputs:
     description: 'Check if a cache entry exists for the given input(s) without downloading it'
     required: false
     default: 'false'
-  use-deltas:
-    description: 'Enable delta caching (download/upload incremental changes). Set to false for base-only caching.'
-    required: false
-    default: 'true'
   # Note: Composite actions can't access secrets.* directly - must be passed from workflow
   aws-access-key-id:
     description: 'AWS Access Key ID for S3 access'
@@ -48,13 +44,13 @@
     description: 'The key that was used to restore the cache (may be from restore-keys)'
     value: ${{ steps.restore-cache.outputs.cache-primary-key }}
   cache-matched-key:
-    description: 'The key that matched (same as cache-primary-key for compatibility)'
-    value: ${{ steps.restore-cache.outputs.cache-primary-key }}
+    description: 'The key that was used to restore the cache (exact or prefix match)'
+    value: ${{ steps.restore-cache.outputs.cache-matched-key }}
 runs:
   using: 'composite'
   steps:
-    - name: Restore cache from S3 with OverlayFS
+    - name: Restore cache from S3
       id: restore-cache
       shell: bash
       env:
@@ -67,133 +63,42 @@ runs:
         TARGET_PATH: ${{ inputs.path }}
         FAIL_ON_MISS: ${{ inputs.fail-on-cache-miss }}
         LOOKUP_ONLY: ${{ inputs.lookup-only }}
-        USE_DELTAS: ${{ inputs.use-deltas }}
-        COMMIT_MSG: ${{ github.event.head_commit.message }}
       run: |
         set -euo pipefail
         echo "=========================================="
-        echo "Xahau Cache Restore (S3 + OverlayFS)"
+        echo "Xahau Cache Restore (S3)"
         echo "=========================================="
         echo "Target path: ${TARGET_PATH}"
-        echo "Primary key: ${CACHE_KEY}"
+        echo "Cache key: ${CACHE_KEY}"
         echo "S3 bucket: s3://${S3_BUCKET}"
-        echo "Use deltas: ${USE_DELTAS}"
         echo ""
         # Normalize target path (expand tilde and resolve to absolute path)
         # This ensures consistent path comparison in the mount registry
         if [[ "${TARGET_PATH}" == ~* ]]; then
           # Expand tilde manually (works even if directory doesn't exist yet)
           TARGET_PATH="${HOME}${TARGET_PATH:1}"
         fi
         TARGET_PATH=$(realpath -m "${TARGET_PATH}")
         echo "Normalized target path: ${TARGET_PATH}"
         echo ""
-        # Generate unique cache workspace
-        CACHE_HASH=$(echo "${CACHE_KEY}" | md5sum | cut -d' ' -f1)
-        CACHE_WORKSPACE="/tmp/xahau-cache-${CACHE_HASH}"
-        echo "Cache workspace: ${CACHE_WORKSPACE}"
-        # Check for [ci-clear-cache] tag in commit message
-        if echo "${COMMIT_MSG}" | grep -q '\[ci-clear-cache\]'; then
-          echo ""
-          echo "🗑️ [ci-clear-cache] detected in commit message"
-          echo "Clearing cache for key: ${CACHE_KEY}"
-          echo ""
-          # Delete base layer
-          S3_BASE_KEY="s3://${S3_BUCKET}/${CACHE_KEY}-base.tar.zst"
-          if aws s3 ls "${S3_BASE_KEY}" --region "${S3_REGION}" >/dev/null 2>&1; then
-            echo "Deleting base layer: ${S3_BASE_KEY}"
-            aws s3 rm "${S3_BASE_KEY}" --region "${S3_REGION}" 2>/dev/null || true
-            echo "✓ Base layer deleted"
-          else
-            echo "  No base layer found to delete"
-          fi
-          # Delete all delta layers for this key
-          echo "Deleting all delta layers matching: ${CACHE_KEY}-delta-*"
-          DELTA_COUNT=$(aws s3 ls "s3://${S3_BUCKET}/" --region "${S3_REGION}" | grep "${CACHE_KEY}-delta-" | wc -l || echo "0")
-          DELTA_COUNT=$(echo "${DELTA_COUNT}" | tr -d ' \n') # Trim whitespace
-          if [ "${DELTA_COUNT}" -gt 0 ]; then
-            aws s3 rm "s3://${S3_BUCKET}/" --recursive \
-              --exclude "*" \
-              --include "${CACHE_KEY}-delta-*" \
-              --region "${S3_REGION}" 2>/dev/null || true
-            echo "✓ Deleted ${DELTA_COUNT} delta layer(s)"
-          else
-            echo "  No delta layers found to delete"
-          fi
-          echo ""
-          echo "✅ Cache cleared successfully"
-          echo "Build will proceed from scratch (bootstrap mode)"
-          echo ""
-        fi
-        # Create OverlayFS directory structure
-        mkdir -p "${CACHE_WORKSPACE}"/{base,upper,work,merged}
-        # Function to try downloading from S3
+        # Function to try restoring a cache key
         try_restore_key() {
-          local try_key="$1"
-          local s3_base="s3://${S3_BUCKET}/${try_key}-base.tar.zst"
+          local key=$1
+          local s3_key="s3://${S3_BUCKET}/${key}-base.tar.zst"
-          echo "Trying cache key: ${try_key}"
-          # Check if base exists (one base per key, immutable)
-          echo "Checking for base layer..."
-          if aws s3 ls "${s3_base}" --region "${S3_REGION}" >/dev/null 2>&1; then
-            echo "✓ Found base layer: ${s3_base}"
-            if [ "${LOOKUP_ONLY}" = "true" ]; then
-              echo "Lookup-only mode: cache exists, skipping download"
-              return 0
-            fi
-            # Download base layer
-            echo "Downloading base layer..."
-            aws s3 cp "${s3_base}" /tmp/cache-base.tar.zst --region "${S3_REGION}" --quiet
-            # Extract base layer
-            echo "Extracting base layer..."
-            tar -xf /tmp/cache-base.tar.zst -C "${CACHE_WORKSPACE}/base"
-            rm /tmp/cache-base.tar.zst
-            # Query for latest timestamped delta (only if use-deltas enabled)
-            if [ "${USE_DELTAS}" = "true" ]; then
-              echo "Querying for latest delta..."
-              LATEST_DELTA=$(aws s3api list-objects-v2 \
-                --bucket "${S3_BUCKET}" \
-                --prefix "${try_key}-delta-" \
-                --region "${S3_REGION}" \
-                --query 'sort_by(Contents, &LastModified)[-1].Key' \
-                --output text 2>/dev/null || echo "")
-              if [ -n "${LATEST_DELTA}" ] && [ "${LATEST_DELTA}" != "None" ]; then
-                echo "✓ Found latest delta: ${LATEST_DELTA}"
-                echo "Downloading delta layer..."
-                aws s3 cp "s3://${S3_BUCKET}/${LATEST_DELTA}" /tmp/cache-delta.tar.zst --region "${S3_REGION}" --quiet
-                echo "Extracting delta layer..."
-                tar -xf /tmp/cache-delta.tar.zst -C "${CACHE_WORKSPACE}/upper" 2>/dev/null || true
-                rm /tmp/cache-delta.tar.zst
-              else
-                echo "  No delta layer found (this is fine for first build)"
-              fi
-            else
-              echo "  Delta caching disabled (use-deltas: false)"
-            fi
+          echo "Checking for key: ${key}"
+          if aws s3 ls "${s3_key}" --region "${S3_REGION}" >/dev/null 2>&1; then
+            echo "✓ Found cache: ${s3_key}"
             return 0
           else
-            echo "✗ No base layer found for key: ${try_key}"
+            echo "✗ Not found: ${key}"
             return 1
           fi
         }
-        # Try primary key first
+        # Try exact match first
         MATCHED_KEY=""
         EXACT_MATCH="false"
@@ -208,12 +113,8 @@
           echo ""
           echo "Primary key not found, trying restore-keys..."
-          # Split restore-keys by newline
           while IFS= read -r restore_key; do
-            # Skip empty lines
            [ -z "${restore_key}" ] && continue
-            # Trim whitespace
            restore_key=$(echo "${restore_key}" | xargs)
            if try_restore_key "${restore_key}"; then
@@ -231,7 +132,6 @@
         if [ -z "${MATCHED_KEY}" ]; then
           echo ""
           echo "❌ No cache found for key: ${CACHE_KEY}"
-          echo "This is BOOTSTRAP mode - first build for this cache key"
           if [ "${FAIL_ON_MISS}" = "true" ]; then
             echo "fail-on-cache-miss is enabled, failing workflow"
@@ -241,16 +141,11 @@
           # Set outputs for cache miss
           echo "cache-hit=false" >> $GITHUB_OUTPUT
           echo "cache-primary-key=" >> $GITHUB_OUTPUT
+          echo "cache-matched-key=" >> $GITHUB_OUTPUT
-          # Create empty cache directory for bootstrap
+          # Create empty cache directory
           mkdir -p "${TARGET_PATH}"
-          # Record bootstrap mode for save action
-          # Format: path:workspace:matched_key:primary_key:exact_match:use_deltas
-          # For bootstrap: workspace="bootstrap", matched_key=primary_key, exact_match=false
-          MOUNT_REGISTRY="/tmp/xahau-cache-mounts.txt"
-          echo "${TARGET_PATH}:bootstrap:${CACHE_KEY}:${CACHE_KEY}:false:${USE_DELTAS}" >> "${MOUNT_REGISTRY}"
           echo ""
           echo "=========================================="
           echo "Cache restore completed (bootstrap mode)"
@@ -262,36 +157,30 @@
         # If lookup-only, we're done
         if [ "${LOOKUP_ONLY}" = "true" ]; then
           echo "cache-hit=${EXACT_MATCH}" >> $GITHUB_OUTPUT
-          echo "cache-primary-key=${MATCHED_KEY}" >> $GITHUB_OUTPUT
-          # Clean up workspace
-          rm -rf "${CACHE_WORKSPACE}"
+          echo "cache-primary-key=${CACHE_KEY}" >> $GITHUB_OUTPUT
+          echo "cache-matched-key=${MATCHED_KEY}" >> $GITHUB_OUTPUT
           echo ""
           echo "=========================================="
           echo "Cache lookup completed (lookup-only mode)"
           echo "Cache exists: ${MATCHED_KEY}"
           echo "=========================================="
           exit 0
         fi
-        # Mount OverlayFS
+        # Download and extract cache
+        S3_KEY="s3://${S3_BUCKET}/${MATCHED_KEY}-base.tar.zst"
+        TEMP_TARBALL=$(mktemp /tmp/xahau-cache-XXXXXX.tar.zst)
         echo ""
-        echo "Mounting OverlayFS..."
-        sudo mount -t overlay overlay \
-          -o lowerdir="${CACHE_WORKSPACE}/base",upperdir="${CACHE_WORKSPACE}/upper",workdir="${CACHE_WORKSPACE}/work" \
-          "${CACHE_WORKSPACE}/merged"
+        echo "Downloading cache..."
+        aws s3 cp "${S3_KEY}" "${TEMP_TARBALL}" --region "${S3_REGION}"
-        # Verify mount
-        if mount | grep -q "${CACHE_WORKSPACE}/merged"; then
-          echo "✓ OverlayFS mounted successfully"
-        else
-          echo "❌ Failed to mount OverlayFS"
-          exit 1
-        fi
+        TARBALL_SIZE=$(du -h "${TEMP_TARBALL}" | cut -f1)
+        echo "✓ Downloaded: ${TARBALL_SIZE}"
-        # Create target directory parent if needed
-        TARGET_PARENT=$(dirname "${TARGET_PATH}")
-        mkdir -p "${TARGET_PARENT}"
+        # Create parent directory if needed
+        mkdir -p "$(dirname "${TARGET_PATH}")"
         # Remove existing target if it exists
         if [ -e "${TARGET_PATH}" ]; then
@@ -299,30 +188,24 @@
           rm -rf "${TARGET_PATH}"
         fi
-        # Symlink target path to merged view
-        echo "Creating symlink: ${TARGET_PATH} -> ${CACHE_WORKSPACE}/merged"
-        ln -s "${CACHE_WORKSPACE}/merged" "${TARGET_PATH}"
+        # Create target directory and extract
+        mkdir -p "${TARGET_PATH}"
         echo ""
+        echo "Extracting cache..."
+        zstd -d -c "${TEMP_TARBALL}" | tar -xf - -C "${TARGET_PATH}"
+        echo "✓ Cache extracted to: ${TARGET_PATH}"
-        # Save mount info for cleanup/save later
-        # Format: path:workspace:matched_key:primary_key:exact_match:use_deltas
-        # This tells save action whether to create new base (partial match) or just delta (exact match)
-        MOUNT_REGISTRY="/tmp/xahau-cache-mounts.txt"
-        echo "${TARGET_PATH}:${CACHE_WORKSPACE}:${MATCHED_KEY}:${CACHE_KEY}:${EXACT_MATCH}:${USE_DELTAS}" >> "${MOUNT_REGISTRY}"
+        # Cleanup
+        rm -f "${TEMP_TARBALL}"
         # Set outputs
         echo "cache-hit=${EXACT_MATCH}" >> $GITHUB_OUTPUT
-        echo "cache-primary-key=${MATCHED_KEY}" >> $GITHUB_OUTPUT
-        # Show statistics
         echo ""
-        echo "Cache statistics:"
-        echo "  Base layer size: $(du -sh ${CACHE_WORKSPACE}/base 2>/dev/null | cut -f1 || echo '0')"
-        echo "  Delta layer size: $(du -sh ${CACHE_WORKSPACE}/upper 2>/dev/null | cut -f1 || echo '0')"
-        echo "  Merged view size: $(du -sh ${CACHE_WORKSPACE}/merged 2>/dev/null | cut -f1 || echo '0')"
+        echo "cache-primary-key=${CACHE_KEY}" >> $GITHUB_OUTPUT
+        echo "cache-matched-key=${MATCHED_KEY}" >> $GITHUB_OUTPUT
         echo ""
         echo "=========================================="
         echo "Cache restore completed successfully"
-        echo "Exact match: ${EXACT_MATCH}"
+        echo "Cache hit: ${EXACT_MATCH}"
         echo "Matched key: ${MATCHED_KEY}"
         echo "=========================================="