# Benchmark workflow: builds llama-server and runs the benchmark suite from
# tools/server/bench on a GPU runner.
name: Benchmark

# Events that start the workflow.
on:
  # Manual run: pick the Azure GPU series, optionally pin a commit and a
  # benchmark duration.
  workflow_dispatch:
    inputs:
      gpu-series:
        description: 'Azure GPU series to run with'
        required: true
        type: choice
        options:
          - Standard_NC4as_T4_v3
          - Standard_NC24ads_A100_v4
          - Standard_NC80adis_H100_v5
      sha:
        description: 'Commit SHA1 to build'
        required: false
        type: string
      duration:
        description: 'Duration of the bench'
        type: string
        default: 10m

  # Pushes to master that touch the core sources, CUDA kernels or the server.
  push:
    branches:
      - master
    paths:
      - 'llama.cpp'
      - 'ggml.c'
      - 'ggml-backend.cpp'
      - 'ggml-quants.c'
      - '**/*.cu'
      - 'tools/server/*.h*'
      - 'tools/server/*.cpp'

  # Same path filter for pull requests.
  # NOTE(review): `pull_request_target` runs with the base repository's
  # permissions and secrets - confirm no job executes untrusted PR code.
  pull_request_target:
    types:
      - opened
      - synchronize
      - reopened
    paths:
      - 'llama.cpp'
      - 'ggml.c'
      - 'ggml-backend.cpp'
      - 'ggml-quants.c'
      - '**/*.cu'
      - 'tools/server/*.h*'
      - 'tools/server/*.cpp'

  # Nightly run at 02:04 (cron schedules are evaluated in UTC).
  schedule:
    - cron: '04 2 * * *'

# Runs are grouped by workflow, ref, PR head branch (or the run id when there
# is no PR head branch) and the manually requested SHA; a newer run in the same
# group cancels the one still in progress.
concurrency:
  cancel-in-progress: true
  group: '${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }}'

jobs:
  # Builds llama-server with CUDA for the T4 runner, runs tools/server/bench
  # once per quantization type in the matrix, uploads the results and reports
  # them as a commit status and (for one matrix entry) a PR comment.
  #
  # NOTE(review): on `pull_request_target` this job checks out the PR head
  # commit and executes it (pip install, CMake build, bench.py) in a run where
  # later steps reference secrets. That is the classic "pwn request" pattern;
  # confirm the runner and token are isolated or gate the job on an approval.
  bench-server-baseline:
    runs-on: Standard_NC4as_T4_v3
    env:
      # Same value as `runs-on`, repeated so steps can use it in names/arguments.
      RUNNER_LABEL: Standard_NC4as_T4_v3
      N_USERS: 8
      DURATION: 10m

    strategy:
      matrix:
        model: [phi-2]
        ftype: [q4_0, q8_0, f16]
        include:
          # Only the phi-2 / q4_0 combination posts a PR comment.
          - model: phi-2
            ftype: q4_0
            pr_comment_enabled: "true"

    # NOTE(review): `push` and `schedule` events satisfy neither clause, so the
    # job is skipped for them - confirm that is intended.
    if: |
      inputs.gpu-series == 'Standard_NC4as_T4_v3'
      || github.event_name == 'pull_request_target'
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          # Requested SHA first, then the PR head, then the triggering commit.
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

      - name: Install python env
        id: pipenv
        run: |
          cd tools/server/bench
          python3 -m venv venv
          source venv/bin/activate
          pip install -r requirements.txt

      - name: Prometheus
        id: install_prometheus
        run: |
          wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
          tar xzf prometheus*.tar.gz --strip-components=1
          ./prometheus --config.file=tools/server/bench/prometheus.yml &
          # Wait for the HTTP port, but give up after ~60 s (600 x 0.1 s)
          # instead of blocking the runner forever if Prometheus fails to start.
          attempts=0
          until nc -z localhost 9090; do
            attempts=$((attempts + 1))
            if [ "$attempts" -ge 600 ]; then
              echo "Prometheus did not start listening on localhost:9090" >&2
              exit 1
            fi
            sleep 0.1
          done

      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version: '1.21'

      - name: Install k6 and xk6-sse
        id: k6_installation
        run: |
          cd tools/server/bench
          go install go.k6.io/xk6/cmd/xk6@latest
          xk6 build master \
            --with github.com/phymbert/xk6-sse

      - name: Build
        id: cmake_build
        run: |
          set -eux
          # GGML_CUDA replaces the deprecated LLAMA_CUBLAS switch.
          # Architecture 75 matches the T4 GPU of this runner series.
          cmake -B build \
            -DGGML_NATIVE=OFF \
            -DLLAMA_BUILD_SERVER=ON \
            -DGGML_CUDA=ON \
            -DCUDAToolkit_ROOT=/usr/local/cuda \
            -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
            -DCMAKE_CUDA_ARCHITECTURES=75 \
            -DLLAMA_FATAL_WARNINGS=OFF \
            -DLLAMA_ALL_WARNINGS=OFF \
            -DCMAKE_BUILD_TYPE=Release
          cmake --build build --config Release -j "$(nproc)" --target llama-server

      - name: Download the dataset
        id: download_dataset
        run: |
          cd tools/server/bench
          wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

      - name: Server bench
        id: server_bench
        env:
          # User-controlled values are passed through the environment and quoted
          # in the script so they cannot be interpreted as shell syntax.
          HEAD_REF: ${{ github.head_ref || github.ref_name }}
          COMMIT_SHA: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }}
          BENCH_DURATION: ${{ github.event.inputs.duration || env.DURATION }}
        run: |
          set -eux

          cd tools/server/bench
          source venv/bin/activate
          python bench.py \
            --runner-label ${{ env.RUNNER_LABEL }} \
            --name ${{ github.job }} \
            --branch "$HEAD_REF" \
            --commit "$COMMIT_SHA" \
            --scenario script.js \
            --duration "$BENCH_DURATION" \
            --hf-repo ggml-org/models \
            --hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \
            --model-path-prefix /models \
            --parallel ${{ env.N_USERS }} \
            -ngl 33 \
            --batch-size 2048 \
            --ubatch-size 256 \
            --ctx-size 16384 \
            --n-prompts 1000 \
            --max-prompt-tokens 1024 \
            --max-tokens 2048

          # Export the variables written by the bench run to later steps.
          cat results.github.env >> "$GITHUB_ENV"

          # Delete the dataset so the *.json glob of the artifact upload does
          # not pick it up.
          rm ShareGPT_V3_unfiltered_cleaned_split.json

      - uses: actions/upload-artifact@v4
        with:
          name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          compression-level: 9
          path: |
            tools/server/bench/*.jpg
            tools/server/bench/*.json
            tools/server/bench/*.log

      - name: Commit status
        uses: Sibz/github-status-action@v1
        with:
          authToken: ${{secrets.GITHUB_TOKEN}}
          sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
          context: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          description: |
            ${{ env.BENCH_RESULTS }}
          state: 'success'

      - name: Upload benchmark images
        uses: devicons/public-upload-to-imgur@v2.2.2
        # Image hosting is best effort; a failed upload must not fail the bench.
        continue-on-error: true
        id: imgur_step
        with:
          client_id: ${{secrets.IMGUR_CLIENT_ID}}
          path: |
            tools/server/bench/prompt_tokens_seconds.jpg
            tools/server/bench/predicted_tokens_seconds.jpg
            tools/server/bench/kv_cache_usage_ratio.jpg
            tools/server/bench/requests_processing.jpg

      - name: Extract mermaid
        id: set_mermaid
        run: |
          set -eux

          cd tools/server/bench
          # Export each <metric>.mermaid file as a multi-line environment
          # variable named after the metric in upper case.
          for metric in prompt_tokens_seconds predicted_tokens_seconds kv_cache_usage_ratio requests_processing; do
            chart=$(cat "${metric}.mermaid")
            {
              echo "${metric^^}<<EOF"
              echo "$chart"
              echo "EOF"
            } >> "$GITHUB_ENV"
          done

      - name: Extract image url
        id: extract_image_url
        # The upload step may have failed, leaving no URLs to extract.
        continue-on-error: true
        run: |
          set -eux

          echo "IMAGE_0=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" >> "$GITHUB_ENV"
          echo "IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" >> "$GITHUB_ENV"
          echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" >> "$GITHUB_ENV"
          echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" >> "$GITHUB_ENV"

      - name: Comment PR
        uses: mshick/add-pr-comment@v2
        id: comment_pr
        # Only for pull request events, and only for the matrix entry that
        # sets pr_comment_enabled.
        if: ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }}
        with:
          message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          message: |
            <p align="center">

            📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀

            </p>

            <details>

            <summary>Expand details for performance related PR only</summary>

            - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
            - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
            - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
            - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s
            - ${{ env.BENCH_GRAPH_XLABEL }}


            <p align="center">

            <img width="100%" height="100%" src="${{ env.IMAGE_0 }}" alt="prompt_tokens_seconds" />

            <details>

            <summary>More</summary>

            ```mermaid
            ${{ env.PROMPT_TOKENS_SECONDS }}
            ```

            </details>

            <img width="100%" height="100%" src="${{ env.IMAGE_1 }}" alt="predicted_tokens_seconds"/>

            <details>
            <summary>More</summary>

            ```mermaid
            ${{ env.PREDICTED_TOKENS_SECONDS }}
            ```

            </details>

            </p>

            <details>

            <summary>Details</summary>

            <p align="center">

            <img width="100%" height="100%" src="${{ env.IMAGE_2 }}" alt="kv_cache_usage_ratio" />

            <details>
            <summary>More</summary>

            ```mermaid
            ${{ env.KV_CACHE_USAGE_RATIO }}
            ```

            </details>

            <img width="100%" height="100%" src="${{ env.IMAGE_3 }}" alt="requests_processing"/>

            <details>
            <summary>More</summary>

            ```mermaid
            ${{ env.REQUESTS_PROCESSING }}
            ```

            </details>

            </p>
            </details>
            </details>