perf: parallelize model downloads and switch to registry cache

- Parallelize all 14 model downloads using ThreadPoolExecutor (6 workers)
  Downloads were sequential (~30 min), now concurrent (~5-10 min)
- Switch Docker cache from type=gha to type=registry (GHCR)
  The GHA cache has a 10 GB limit, causing blob eviction and corrupted builds
  Registry cache has no size limit and persists across runner instances
- Add pip download cache mounts to all pip install layers
  Prevents re-downloading packages when layers rebuild
This commit is contained in:
ashim-hq 2026-04-17 14:54:01 +08:00
parent 2fd0c00564
commit 79c4ed6a35
4 changed files with 69 additions and 25 deletions

View file

@ -99,6 +99,13 @@ jobs:
- uses: docker/setup-buildx-action@v3
- name: Log in to GHCR (for registry cache)
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
password: ${{ secrets.GHCR_TOKEN }}
- uses: docker/build-push-action@v6
with:
context: .
@ -106,5 +113,5 @@ jobs:
push: false
tags: ashim:ci
build-args: SKIP_MODEL_DOWNLOADS=true
cache-from: type=gha,scope=unified
cache-to: type=gha,mode=max,scope=unified
# Read both the CI cache (written below) and the release build's amd64 cache,
# so CI warms from its own previous runs as well as release-layer overlap.
# Without the cache-ci entry here, the cache-to export below is never read back.
cache-from: |
  type=registry,ref=ghcr.io/${{ github.repository }}:cache-ci
  type=registry,ref=ghcr.io/${{ github.repository }}:cache-linux-amd64
cache-to: type=registry,ref=ghcr.io/${{ github.repository }}:cache-ci,mode=max

View file

@ -121,8 +121,8 @@ jobs:
platforms: ${{ matrix.platform }}
labels: ${{ steps.meta.outputs.labels }}
outputs: type=image,"name=ashimhq/ashim,ghcr.io/${{ github.repository }}",push-by-digest=true,name-canonical=true,push=true
cache-from: type=gha,scope=${{ env.PLATFORM_PAIR }}
cache-to: type=gha,mode=max,scope=${{ env.PLATFORM_PAIR }}
cache-from: type=registry,ref=ghcr.io/${{ github.repository }}:cache-${{ env.PLATFORM_PAIR }}
cache-to: type=registry,ref=ghcr.io/${{ github.repository }}:cache-${{ env.PLATFORM_PAIR }},mode=max
- name: Export digest
run: |

View file

@ -164,7 +164,8 @@ RUN apt-get -o Acquire::Retries=3 update && apt-get install -y --no-install-reco
COPY --from=caire-builder /tmp/caire /usr/local/bin/caire
# Python venv - Layer 1: Base packages (rarely change, ~3 GB)
RUN python3 -m venv /opt/venv && \
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m venv /opt/venv && \
/opt/venv/bin/pip install --upgrade pip && \
/opt/venv/bin/pip install \
Pillow==11.1.0 \
@ -172,14 +173,16 @@ RUN python3 -m venv /opt/venv && \
opencv-python-headless==4.10.0.84
# Platform-conditional ONNX runtime
RUN if [ "$TARGETARCH" = "amd64" ]; then \
RUN --mount=type=cache,target=/root/.cache/pip \
if [ "$TARGETARCH" = "amd64" ]; then \
/opt/venv/bin/pip install onnxruntime-gpu==1.20.1 \
; else \
/opt/venv/bin/pip install onnxruntime==1.20.1 \
; fi
# Python venv - Layer 2: Tool packages (change occasionally, ~2 GB)
RUN if [ "$TARGETARCH" = "amd64" ]; then \
RUN --mount=type=cache,target=/root/.cache/pip \
if [ "$TARGETARCH" = "amd64" ]; then \
/opt/venv/bin/pip install rembg==2.0.62 && \
/opt/venv/bin/pip install realesrgan==0.3.0 \
--extra-index-url https://download.pytorch.org/whl/cu126 && \
@ -193,17 +196,20 @@ RUN if [ "$TARGETARCH" = "amd64" ]; then \
; fi
# mediapipe 0.10.21 only has amd64 wheels; arm64 maxes out at 0.10.18
RUN if [ "$TARGETARCH" = "amd64" ]; then \
RUN --mount=type=cache,target=/root/.cache/pip \
if [ "$TARGETARCH" = "amd64" ]; then \
/opt/venv/bin/pip install mediapipe==0.10.21 \
; else \
/opt/venv/bin/pip install mediapipe==0.10.18 \
; fi
# CodeFormer face enhancement (install with --no-deps to avoid numpy 2.x conflict)
RUN /opt/venv/bin/pip install --no-deps codeformer-pip==0.0.4 lpips
RUN --mount=type=cache,target=/root/.cache/pip \
/opt/venv/bin/pip install --no-deps codeformer-pip==0.0.4 lpips
# Re-pin numpy to 1.26.4 in case any transitive dep upgraded it
RUN /opt/venv/bin/pip install numpy==1.26.4
RUN --mount=type=cache,target=/root/.cache/pip \
/opt/venv/bin/pip install numpy==1.26.4
# Pin rembg model storage to a fixed path so models downloaded at build time
# (as root) are found at runtime (as the non-root ashim user, home=/app).

View file

@ -638,21 +638,52 @@ def smoke_test():
def main():
    """Pre-download all ML models in parallel, then verify and smoke-test.

    Runs every download function concurrently (they are independent:
    separate target dirs, separate CDNs), collecting failures instead of
    aborting on the first one, so a single flaky CDN doesn't hide other
    errors. Exits with status 1 if any download failed.
    """
    import concurrent.futures

    print("Pre-downloading all ML models (parallel)...\n")
    # All download functions are independent (separate dirs, separate CDNs).
    # Run them in parallel to cut download time from ~30 min to ~5-10 min.
    download_fns = [
        download_lama_model,
        download_rembg_models,
        download_realesrgan_model,
        download_gfpgan_model,
        download_codeformer_model,
        download_ddcolor_model,
        download_codeformer_onnx_model,
        download_paddleocr_models,
        download_paddleocr_vl_model,
        download_scunet_model,
        download_nafnet_model,
        download_facexlib_models,
        download_opencv_colorize_models,
        download_mediapipe_task_models,
    ]
    # 6 workers balance parallelism with CDN rate limits.
    errors = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=6) as pool:
        future_to_name = {
            pool.submit(fn): fn.__name__ for fn in download_fns
        }
        for future in concurrent.futures.as_completed(future_to_name):
            name = future_to_name[future]
            try:
                future.result()
            except Exception as e:
                # Record and report, but keep draining the remaining futures
                # so one failure doesn't mask others.
                errors.append((name, e))
                print(f"\n*** {name} FAILED: {e}\n")
    if errors:
        print(f"\n{len(errors)} download(s) failed:")
        for name, e in errors:
            print(f"  {name}: {e}")
        sys.exit(1)
    print("\nAll downloads complete. Running verification...\n")
    verify_mediapipe()
    smoke_test()
    print("All models downloaded and verified.")