Add workflow to run evaluation on a subset of datasets (#222)

*Issue #, if available:* *Description of changes:* This PR adds a workflow that will run the evaluation script on `chronos-bolt-small` for a subset of datasets specified in `ci/evaluate/backtest_config.yaml`. After evaluation, a comment will be made on the PR. The workflow will only run if the `run-eval` label is present on a PR. The end-to-end workflow has been split into two workflows: - `eval-model.yml`: only has read access (can be run from forks). This will evaluate the model and upload the metrics CSV file as a GitHub artifact. - `eval-pr-comment.yml`: has read and write access (can only be run when in the `main` branch). This will be triggered when the first job finishes, will download the CSV from the eval job and make the comment. According to [this post](https://securitylab.github.com/resources/github-actions-preventing-pwn-requests/), splitting into two jobs as done here is the recommended and secure way to do this. **NOTE**: The first step works as expected, but we can only test the second step after merging because this workflow needs to be part of the `main` branch for this to work. By submitting this pull request, I confirm that you can use, modify, copy, and redistribute this contribution, under the terms of your choice. --------- Co-authored-by: Abdul Fatir Ansari <ansarnd@amazon.de>
2026-05-24 10:08:33 +00:00 · 2024-12-02 10:05:57 +01:00 · 2024-12-02 10:05:57 +01:00 · eac768ce28
commit eac768ce28
parent e3bbda7207
3 changed files with 126 additions and 0 deletions
--- a/.github/workflows/eval-model.yml
+++ b/.github/workflows/eval-model.yml
@ -0,0 +1,35 @@
+# Evaluates Chronos-Bolt (Small) model on selected datasets.
+#
+# This workflow only evaluates the model and uploads the metrics CSV as the
+# `eval-metrics` artifact. The workflow name ("Evaluate") is referenced by the
+# `workflow_run` trigger of the "Post Eval Metrics" workflow, so keep the two
+# in sync when renaming.
+name: Evaluate
+
+on:
+  # Runs only with read privileges for the GITHUB_TOKEN
+  pull_request:
+    branches: ["main"]  # Run on PRs to main branch
+    types:
+      - opened  # When a PR is created
+      - reopened  # When a closed PR is reopened
+      - synchronize  # When new commits are pushed to the PR
+      # `labeled` is not part of the default activity types; without it,
+      # adding the 'run-eval' label to an open PR would not start a run.
+      - labeled
+
+# Enforce the read-only token explicitly instead of relying on the
+# repository's default token permissions.
+permissions:
+  contents: read
+
+jobs:
+  evaluate-and-post:
+    # Only run if the PR carries the 'run-eval' label
+    if: contains(github.event.pull_request.labels.*.name, 'run-eval')
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      # `-f` adds the CPU-only PyTorch wheel index as an extra source of links
+      - name: Install Dependencies
+        run: pip install ".[evaluation]" -f https://download.pytorch.org/whl/cpu/torch_stable.html
+
+      # Folded scalar: the lines below are joined with spaces into one command
+      - name: Run Eval Script
+        run: >-
+          python scripts/evaluation/evaluate.py
+          ci/evaluate/backtest_config.yaml
+          eval-ci-metrics.csv
+          --chronos-model-id=amazon/chronos-bolt-small
+          --device=cpu
+          --torch-dtype=float32
+
+      # The artifact name and file name are read by the "Post Eval Metrics"
+      # workflow; fail here rather than downstream if the CSV is missing.
+      - name: Upload CSV
+        uses: actions/upload-artifact@v4
+        with:
+          name: eval-metrics
+          path: eval-ci-metrics.csv
+          if-no-files-found: error
+          retention-days: 1
+          overwrite: true
--- a/.github/workflows/eval-pr-comment.yml
+++ b/.github/workflows/eval-pr-comment.yml
@ -0,0 +1,54 @@
+# Post evaluation results from the "Evaluate" workflow as a PR comment.
+#
+# Triggered when a run of the "Evaluate" workflow completes: downloads the
+# `eval-metrics` artifact of that run, converts the CSV into a Markdown table
+# and posts it as a comment on the pull request.
+name: Post Eval Metrics
+
+on:
+  # Runs with read & write privileges for the GITHUB_TOKEN
+  workflow_run:
+    workflows: ["Evaluate"]
+    types:
+      - completed
+
+jobs:
+  comment-eval-results:
+    # Only react to successful "Evaluate" runs that were started by a PR
+    if: >
+      github.event.workflow_run.event == 'pull_request' &&
+      github.event.workflow_run.conclusion == 'success'
+    runs-on: ubuntu-latest
+    permissions:
+      actions: read # for downloading artifacts
+      pull-requests: write # for posting PR comment
+
+    steps:
+      # `run-id` points the download at the triggering "Evaluate" run rather
+      # than at this run, which has no artifacts of its own.
+      - name: Download Eval Metrics
+        uses: actions/download-artifact@v4
+        with:
+          name: eval-metrics
+          path: eval-metrics-artifact/
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          run-id: ${{ github.event.workflow_run.id }}
+
+      # Debugging aid: shows where the artifact files were placed
+      - name: Display structure of downloaded files
+        run: ls -R
+
+      - name: Read CSV
+        id: csv
+        uses: juliangruber/read-file-action@v1
+        with:
+          path: eval-metrics-artifact/eval-ci-metrics.csv
+
+      # NOTE(review): this action is pinned to a mutable branch while the job
+      # holds a write-scoped token; consider pinning to a release tag or a
+      # commit SHA.
+      - name: Create Markdown Table
+        uses: petems/csv-to-md-table-action@master
+        id: csv-table-output
+        with:
+          csvinput: ${{ steps.csv.outputs.content }}
+
+      - name: Post Table as a Comment
+        uses: peter-evans/create-or-update-comment@v4
+        with:
+          token: ${{ secrets.GITHUB_TOKEN }}
+          repository: ${{ github.repository }}
+          # A `workflow_run` payload has no top-level `pull_request` object,
+          # so the PR number is taken from the triggering run instead.
+          # NOTE(review): `workflow_run.pull_requests` may be empty for PRs
+          # opened from forks - confirm; if so, pass the PR number through
+          # the artifact uploaded by the "Evaluate" workflow.
+          issue-number: ${{ github.event.workflow_run.pull_requests[0].number }}
+          body: |
+            ### Evaluation Metrics
+            ${{steps.csv-table-output.outputs.markdown-table}}
+          reactions: rocket
--- a/ci/evaluate/backtest_config.yaml
+++ b/ci/evaluate/backtest_config.yaml
@ -0,0 +1,37 @@
+# Datasets evaluated by the CI "Evaluate" workflow, which passes this file to
+# scripts/evaluation/evaluate.py. One list entry per dataset; the trailing
+# comment on each `name` gives the sampling frequency.
+#
+# NOTE(review): in every entry `offset` equals -`prediction_length`, so the
+# evaluation window is presumably the last `prediction_length` observations of
+# each series - confirm against scripts/evaluation/evaluate.py.
+
+# From In-domain
+- name: taxi_30min  # 30 min
+  hf_repo: autogluon/chronos_datasets
+  offset: -48
+  prediction_length: 48
+  num_rolls: 1
+# From Zero-shot
+- name: ETTh  # Hourly
+  hf_repo: autogluon/chronos_datasets_extra
+  offset: -24
+  prediction_length: 24
+  num_rolls: 1
+- name: monash_covid_deaths  # Daily
+  hf_repo: autogluon/chronos_datasets
+  offset: -30
+  prediction_length: 30
+  num_rolls: 1
+- name: monash_nn5_weekly  # Weekly
+  hf_repo: autogluon/chronos_datasets
+  offset: -8
+  prediction_length: 8
+  num_rolls: 1
+- name: monash_fred_md  # Monthly
+  hf_repo: autogluon/chronos_datasets
+  offset: -12
+  prediction_length: 12
+  num_rolls: 1
+- name: monash_m3_quarterly  # Quarterly
+  hf_repo: autogluon/chronos_datasets
+  offset: -8
+  prediction_length: 8
+  num_rolls: 1
+- name: monash_tourism_yearly  # Yearly
+  hf_repo: autogluon/chronos_datasets
+  offset: -4
+  prediction_length: 4
+  num_rolls: 1