# feat: add SWE-bench and TAU-bench benchmark suite #45
#
# Workflow file for this run:

# CI workflow header: display name, triggers, token permissions and shared
# environment for every job defined below.
name: CI Tests

# Triggers:
#   - push to the integration branches and to feature/refactor branches
#   - pull requests that target main or develop
#   - manual runs from the Actions UI (workflow_dispatch)
on:
  push:
    branches:
      - main
      - develop
      - 'feature/**'
      - 'refactor/**'
  pull_request:
    branches:
      - main
      - develop
  workflow_dispatch:

# Least privilege: the jobs only check out the repository and run tests, so
# the GITHUB_TOKEN is limited to read-only access to repository contents.
permissions:
  contents: read

env:
  # Quoted so the version stays a string; consumed by actions/setup-node.
  NODE_VERSION: '20'
# Job graph (each stage waits for the previous one via `needs`):
#   lint-and-typecheck, unit-tests -> provider-e2e -> agent-integration
#   -> comprehensive-agent-tests; ci-summary always runs last.
jobs:
  # ===========================================================================
  # Stage 1: Fast checks (run on every commit)
  # ===========================================================================
  lint-and-typecheck:
    name: Lint & TypeCheck
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: ${{ env.NODE_VERSION }}
          cache: 'npm'

      - name: Install dependencies
        run: npm ci

      # NOTE(review): the job is named "Lint & TypeCheck" but no lint command
      # is executed here -- confirm whether a lint step is missing.
      - name: TypeScript check
        run: npx tsc --noEmit

      - name: Build
        run: npm run build

  unit-tests:
    name: Unit Tests
    runs-on: ubuntu-latest
    services:
      # PostgreSQL service container used by the unit tests.
      postgres:
        image: postgres:16-alpine
        env:
          POSTGRES_PASSWORD: testpass123
          POSTGRES_DB: kode_test
        ports:
          # host:container -- quoted so the mapping is always read as a string.
          - '5433:5432'
        # Wait until pg_isready succeeds before the steps start.
        options: >-
          --health-cmd pg_isready
          --health-interval 10s
          --health-timeout 5s
          --health-retries 5
    steps:
      - uses: actions/checkout@v4

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: ${{ env.NODE_VERSION }}
          cache: 'npm'

      - name: Install dependencies
        run: npm ci

      - name: Run unit tests
        run: npm run test:unit
        env:
          # Connection settings matching the service container above; the port
          # is the host side of the mapping.
          POSTGRES_HOST: localhost
          POSTGRES_PORT: '5433'
          POSTGRES_DB: kode_test
          POSTGRES_USER: postgres
          POSTGRES_PASSWORD: testpass123

  # ===========================================================================
  # Stage 2: Provider E2E tests (run on every commit)
  # ===========================================================================
  provider-e2e:
    name: Provider E2E
    runs-on: ubuntu-latest
    needs: [lint-and-typecheck, unit-tests]
    timeout-minutes: 20
    strategy:
      # Let every provider finish even if another provider's leg fails.
      fail-fast: false
      matrix:
        provider:
          - anthropic
          - openai
          - gemini
    steps:
      - uses: actions/checkout@v4

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: ${{ env.NODE_VERSION }}
          cache: 'npm'

      - name: Install dependencies
        run: npm ci

      # Secrets and variables are handed to the shell through `env` and
      # expanded there, instead of being spliced into the script text by
      # `${{ }}` interpolation. The resulting .env.test content is the same.
      - name: Create test environment
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          ANTHROPIC_MODEL_ID: ${{ vars.ANTHROPIC_MODEL_ID }}
          ANTHROPIC_BASE_URL: ${{ vars.ANTHROPIC_BASE_URL }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          OPENAI_MODEL_ID: ${{ vars.OPENAI_MODEL_ID }}
          OPENAI_BASE_URL: ${{ vars.OPENAI_BASE_URL }}
          GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
          GEMINI_MODEL_ID: ${{ vars.GEMINI_MODEL_ID }}
          GEMINI_BASE_URL: ${{ vars.GEMINI_BASE_URL }}
        run: |
          cat > .env.test << EOF
          # Anthropic
          ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}
          ANTHROPIC_MODEL_ID=${ANTHROPIC_MODEL_ID}
          ANTHROPIC_BASE_URL=${ANTHROPIC_BASE_URL}
          ANTHROPIC_ENABLE_INTERTWINED=0
          ANTHROPIC_ENABLE_PDF=0
          # OpenAI Chat
          OPENAI_API_KEY=${OPENAI_API_KEY}
          OPENAI_MODEL_ID=${OPENAI_MODEL_ID}
          OPENAI_BASE_URL=${OPENAI_BASE_URL}
          OPENAI_API=chat
          # Gemini
          GEMINI_API_KEY=${GEMINI_API_KEY}
          GEMINI_MODEL_ID=${GEMINI_MODEL_ID}
          GEMINI_BASE_URL=${GEMINI_BASE_URL}
          GEMINI_ENABLE_INTERTWINED=1
          GEMINI_ENABLE_PDF=1
          EOF

      # Exactly one of the following steps runs per matrix leg.
      - name: Run Anthropic E2E
        if: matrix.provider == 'anthropic'
        run: npx ts-node tests/e2e/providers/anthropic.test.ts
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}

      - name: Run OpenAI E2E
        if: matrix.provider == 'openai'
        run: npx ts-node tests/e2e/providers/openai.test.ts
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}

      - name: Run Gemini E2E
        if: matrix.provider == 'gemini'
        run: npx ts-node tests/e2e/providers/gemini.test.ts
        env:
          GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}

  # ===========================================================================
  # Stage 3: Agent Integration tests (comprehensive agent workflow testing)
  # ===========================================================================
  agent-integration:
    name: Agent Integration
    runs-on: ubuntu-latest
    needs: [provider-e2e]
    timeout-minutes: 30
    steps:
      - uses: actions/checkout@v4

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: ${{ env.NODE_VERSION }}
          cache: 'npm'

      - name: Install dependencies
        run: npm ci

      # Values reach the script through `env` and are expanded by the shell.
      - name: Create test environment
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          ANTHROPIC_MODEL_ID: ${{ vars.ANTHROPIC_MODEL_ID }}
          ANTHROPIC_BASE_URL: ${{ vars.ANTHROPIC_BASE_URL }}
        run: |
          cat > .env.test << EOF
          # Use Anthropic-compatible API for integration tests
          ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}
          ANTHROPIC_MODEL_ID=${ANTHROPIC_MODEL_ID}
          ANTHROPIC_BASE_URL=${ANTHROPIC_BASE_URL}
          EOF

      - name: Run Agent Integration Tests
        run: npx ts-node tests/integration/agent/ci-integration.test.ts
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}

  # ===========================================================================
  # Stage 4: Comprehensive Agent Tests (deep agent workflow testing)
  # ===========================================================================
  comprehensive-agent-tests:
    name: Comprehensive Agent Tests
    runs-on: ubuntu-latest
    needs: [agent-integration]
    timeout-minutes: 45
    steps:
      - uses: actions/checkout@v4

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: ${{ env.NODE_VERSION }}
          cache: 'npm'

      - name: Install dependencies
        run: npm ci

      # Values reach the script through `env` and are expanded by the shell.
      - name: Create test environment
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          ANTHROPIC_MODEL_ID: ${{ vars.ANTHROPIC_MODEL_ID }}
          ANTHROPIC_BASE_URL: ${{ vars.ANTHROPIC_BASE_URL }}
        run: |
          cat > .env.test << EOF
          ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}
          ANTHROPIC_MODEL_ID=${ANTHROPIC_MODEL_ID}
          ANTHROPIC_BASE_URL=${ANTHROPIC_BASE_URL}
          EOF

      - name: Run Comprehensive Agent Tests
        run: npx ts-node --transpileOnly tests/integration/agent/ci-agent.test.ts
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}

  # ===========================================================================
  # Summary job
  # ===========================================================================
  ci-summary:
    name: CI Summary
    runs-on: ubuntu-latest
    needs:
      - lint-and-typecheck
      - unit-tests
      - provider-e2e
      - agent-integration
      - comprehensive-agent-tests
    # Run regardless of upstream outcome so a verdict is always reported.
    if: always()
    steps:
      - name: Check results
        # Job results are passed as environment variables rather than being
        # interpolated into the script text.
        env:
          LINT_RESULT: ${{ needs.lint-and-typecheck.result }}
          UNIT_RESULT: ${{ needs.unit-tests.result }}
          E2E_RESULT: ${{ needs.provider-e2e.result }}
          INTEGRATION_RESULT: ${{ needs.agent-integration.result }}
          COMPREHENSIVE_RESULT: ${{ needs.comprehensive-agent-tests.result }}
        run: |
          echo "=== CI Results ==="
          echo "Lint & TypeCheck: ${LINT_RESULT}"
          echo "Unit Tests: ${UNIT_RESULT}"
          echo "Provider E2E: ${E2E_RESULT}"
          echo "Agent Integration: ${INTEGRATION_RESULT}"
          echo "Comprehensive Agent Tests: ${COMPREHENSIVE_RESULT}"

          # A cancelled job must not be reported as a pass, so it is treated
          # the same as a failure. Skipped jobs need no separate check here:
          # no job has its own `if`, so a skip only follows an upstream
          # failure or cancellation, which this loop already catches.
          for result in \
            "${LINT_RESULT}" \
            "${UNIT_RESULT}" \
            "${E2E_RESULT}" \
            "${INTEGRATION_RESULT}" \
            "${COMPREHENSIVE_RESULT}"; do
            case "${result}" in
              failure|cancelled)
                echo "Some tests failed or were cancelled!"
                exit 1
                ;;
            esac
          done

          echo "All CI checks passed!"