/* global React, Button, Icon, LoadingSpinner, Pill, Select, EmptyState, ConfirmDialog, useApiResource, apiFetch, formatDateTime, formatRelative, formatDuration, formatNumber, shortId, displayModelName, useToast, useAuth, requirePaidAction, canUseTenantAction, tenantActionDisabledReason, WorkflowTargetPicker, WorkflowOnboardingAgentStart, WorkflowOnboardingUseCase, WorkflowOnboardingReview, WorkflowOnboardingManual, useWorkflowDraftFlow, McpServerCard, McpConnectModal, useMcpConnectFormState, useMcpProvisioningPoll, buildMcpConnectPayload, validateMcpConnectFormInputs, runMcpOAuthFlow, startOAuthConnect */
//
// Benchmark area
// ----------------
// Routing is intentionally flat. The sidebar "Benchmark" link opens a
// benchmark index; clicking a row opens the consolidated workflow detail.
//
//   /benchmark                         → benchmark workflow index table.
//   /benchmark/workflows               → same index table (legacy alias).
//   /benchmark/workflows/:id           → workflow detail: switcher header,
//                                        aggregate leaderboard, past-batches
//                                        table, and the inline batch panel
//                                        for the row currently selected.
//   /benchmark/workflows/:id?batch=... → deep-link a specific batch into
//                                        the inline panel (refresh-safe).
//
// Benchmark source management lives at the top of the /benchmark
// index page. The MCP/CLI source pages under /sources/* are
// production-only — competitor (benchmark) sources are added,
// listed, and deleted from the two cards above the workflows table.
//
// Implementation notes
// --------------------
// - All data fetched via `useApiResource` to share auth / loading state with
//   the rest of the app.
// - The inline batch panel auto-refreshes while the batch is still active;
//   selecting another row in the past-batches table swaps the panel via
//   the `?batch=` query param (replace-state, so back/forward isn't
//   spammed by row clicks).
// - The "+ New benchmark" button first picks sources so the workflow agent
//   has real tool context, then chooses/defines the workflow, then picks
//   tester harnesses/models:
//     Screen 1 — pick sources (internal + benchmark, multi-select).
//     Screen 2 — choose an existing workflow, let Armature draft a workflow
//                definition from the selected source tools, or fill it manually.
//     Screen 3 — pick tester targets (rendered with the same
//                WorkflowTargetPicker used in the production workflow editor).
//   No visible step indicator — each screen has its own header and footer.
//   On submit we POST /api/benchmark/workflows
//   (mcpServerId is null — benchmark workflows don't pin a primary source)
//   and immediately POST /api/benchmark/workflows/:id/batches with the
//   chosen sources + targets. Subsequent "Run benchmark" clicks open a
//   matching dialog pre-filled from the last batch's config.
// - Per-cell rows in the inline batch panel link DIRECTLY to /runs/:id —
//   the run trace page now surfaces all the per-run benchmark details
//   that used to live in a slide-over drawer.

// Local aliases for the React hooks this section uses. The trailing "B"
// (for Benchmark) presumably avoids redeclaration clashes with other
// sections that destructure the same hooks off the React global in this
// no-build-step, globals-based frontend — TODO confirm against app.jsx.
const {
  useCallback: useCbB,
  useEffect: useEffectB,
  useMemo: useMemoB,
  useRef: useRefB,
  useState: useStateB,
} = React;

// Tester-harness key → logo descriptor ({ src, label }) rendered next to
// harness rows in benchmark grids. `src` paths are static frontend assets;
// `label` is the human-readable harness name used for alt text / captions.
const BENCHMARK_HARNESS_LOGOS = {
  claude_code: { src: '/frontend/assets/logos/claude-code.png', label: 'Claude Code' },
  claude: { src: '/frontend/assets/logos/anthropic.svg.png', label: 'Claude' },
  chatgpt: { src: '/frontend/assets/logos/chatgpt.png', label: 'ChatGPT' },
  codex: { src: '/frontend/assets/logos/codex.png', label: 'Codex' },
  gemini: { src: '/frontend/assets/logos/gemini.png', label: 'Gemini CLI' },
  cursor: { src: '/frontend/assets/logos/cursor.png', label: 'Cursor' },
  openclaw: { src: '/frontend/assets/logos/openclaw.svg', label: 'Openclaw' },
  opencode: { src: '/frontend/assets/logos/opencode.svg', label: 'OpenCode' },
};

// sdk_key → harness logo. The model_catalog.sdk_key 'claude' is the Claude
// Code CLI (workers/src/cli/claude-runner.ts), NOT the Anthropic API
// consumer harness — that one is 'anthropic_api'. Mapping sdk_key='claude'
// to the API-harness logo (Anthropic 'Claude') used to show the wrong icon
// next to Claude Code rows in benchmark grids.
// Values are the shared descriptors from BENCHMARK_HARNESS_LOGOS above, so
// each logo asset/label is defined exactly once.
const BENCHMARK_SDK_LOGOS = {
  claude: BENCHMARK_HARNESS_LOGOS.claude_code,
  codex: BENCHMARK_HARNESS_LOGOS.codex,
  gemini: BENCHMARK_HARNESS_LOGOS.gemini,
  cursor: BENCHMARK_HARNESS_LOGOS.cursor,
  openclaw: BENCHMARK_HARNESS_LOGOS.openclaw,
  opencode: BENCHMARK_HARNESS_LOGOS.opencode,
  anthropic_api: BENCHMARK_HARNESS_LOGOS.claude,
  openai_api: BENCHMARK_HARNESS_LOGOS.chatgpt,
};

// ---------------------------------------------------------------------------
// Top-level dispatcher
// ---------------------------------------------------------------------------

// `section` is supplied as either undefined or 'workflows'. The legacy
// 'batches' section is gone — its allowlist entry was dropped from
// app.jsx and any leftover deep links 404 to the dashboard.
function BenchmarkPage({ navigate, queryString, section: _section, recordId }) {
  if (recordId) {
    return (
      <BenchmarkWorkflowDetailView
        navigate={navigate}
        workflowId={recordId}
        queryString={queryString} />
    );
  }
  return <BenchmarkIndexView navigate={navigate} />;
}

// ---------------------------------------------------------------------------
// Index: /benchmark → benchmark workflow table
// ---------------------------------------------------------------------------

// Index page for /benchmark: the benchmark-sources window (MCP + CLI
// cards), the workflows table, the three-screen create dialog, the
// add-source connect modal, and the delete-source confirm dialog.
// All mutating actions funnel through `requirePaidAction`, which returns
// false (after redirecting/toasting as appropriate) when the tenant can't
// perform the action.
function BenchmarkIndexView({ navigate }) {
  const auth = useAuth();
  const toast = useToast();
  // Benchmark CRUD and source management are gated separately: source
  // management requires the 'editor' role, so the two button sets can
  // disable independently, each with its own tooltip reason.
  const canManageBenchmarks = canUseTenantAction(auth);
  const canManageSources = canUseTenantAction(auth, 'editor');
  const sourceActionDisabledReason = tenantActionDisabledReason(auth, 'editor');
  const benchmarkActionDisabledReason = tenantActionDisabledReason(auth);
  const workflows = useApiResource('/api/benchmark/workflows');
  const sources = useApiResource('/api/benchmark/sources');
  const productionSources = useApiResource('/api/mcp-servers');
  const catalog = useApiResource('/api/catalog/models');
  const [createOpen, setCreateOpen] = useStateB(false);
  const [creating, setCreating] = useStateB(false);
  // `null` until the user clicks one of the two "Add benchmark…" CTAs;
  // then 'mcp' or 'cli'. The connect modal is conditionally mounted on
  // the chosen kind so its formState hook locks to the right shape.
  const [addBenchmarkKind, setAddBenchmarkKind] = useStateB(null);
  // { id, name } of the source awaiting delete confirmation, or null.
  const [pendingDeleteSource, setPendingDeleteSource] = useStateB(null);
  const [deletingSource, setDeletingSource] = useStateB(false);

  const benchmarkSourceRows = sources.data?.rows || [];
  // Rows are split by target kind for the two side-by-side cards. Rows
  // carrying neither `target_kind` nor `targetKind` are treated as MCP —
  // presumably older rows predate the field; TODO confirm server-side.
  const benchmarkMcpRows = useMemoB(
    () => benchmarkSourceRows.filter((row) => (row.target_kind || row.targetKind || 'mcp') === 'mcp'),
    [benchmarkSourceRows],
  );
  const benchmarkCliRows = useMemoB(
    () => benchmarkSourceRows.filter((row) => (row.target_kind || row.targetKind) === 'cli'),
    [benchmarkSourceRows],
  );
  // Show skeletons only on the first load; background reloads keep the
  // last-rendered rows in place.
  const sourcesLoading = sources.loading && !sources.data;

  // Combined production + benchmark rows fed to the create dialog's
  // source picker, tagged with `_origin` so the picker can tell them
  // apart. Production rows are listed first.
  const allSourceRows = useMemoB(() => {
    const benchmark = benchmarkSourceRows.map((row) => ({ ...row, _origin: 'benchmark' }));
    const production = (productionSources.data?.rows || []).map((row) => ({ ...row, _origin: 'production' }));
    return [...production, ...benchmark];
  }, [benchmarkSourceRows, productionSources.data]);

  const rows = workflows.data?.rows || [];

  // Deletes the source pending confirmation. The DELETE goes through the
  // shared /api/mcp-servers endpoint — benchmark sources apparently share
  // storage with production sources (see the create path, which POSTs to
  // /api/benchmark/sources); TODO confirm this is intentional.
  async function confirmDeleteSource() {
    if (!pendingDeleteSource) return;
    if (!requirePaidAction(auth, navigate, { requiredRole: 'editor' })) return;
    setDeletingSource(true);
    try {
      await apiFetch(`/api/mcp-servers/${pendingDeleteSource.id}`, { method: 'DELETE' });
      toast.show({ tone: 'ok', title: `Deleted ${pendingDeleteSource.name}` });
      setPendingDeleteSource(null);
      await sources.reload();
    } catch (err) {
      toast.show({ tone: 'bad', title: 'Delete failed', description: err.message });
    } finally {
      setDeletingSource(false);
    }
  }

  // Creates the workflow and, when `runFirstBatch` is set, immediately
  // dispatches the first batch with the picked sources/targets. Batch
  // dispatch failure is non-fatal: the workflow was created, so we warn
  // and navigate to it anyway.
  async function createBenchmarkWorkflow(spec, { runFirstBatch }) {
    if (!requirePaidAction(auth, navigate, { requiredRole: 'editor' })) return;
    setCreating(true);
    try {
      // Source/target selections are batch-level config, not part of the
      // workflow record — strip them before POSTing the workflow.
      const { mcpServerIds, testerTargets, ...workflowSpec } = spec;
      const result = await apiFetch('/api/benchmark/workflows', {
        method: 'POST',
        body: JSON.stringify({
          ...workflowSpec,
          // Benchmark workflows don't pin a primary source — the matrix
          // is decided per batch. We pass null explicitly so the
          // server-side validator picks the benchmark code path.
          mcpServerId: null,
        }),
      });
      const newId = result?.workflow?.id;
      // Defensive: if the response shape lacks an id we can't deep-link
      // or start a batch; just refresh the index in place.
      if (!newId) {
        toast.show({ tone: 'ok', title: 'Benchmark workflow created' });
        setCreateOpen(false);
        workflows.reload();
        return;
      }
      if (!runFirstBatch) {
        toast.show({ tone: 'ok', title: 'Benchmark workflow created' });
        setCreateOpen(false);
        navigate(`/benchmark/workflows/${newId}`);
        return;
      }
      try {
        const batchResult = await apiFetch(`/api/benchmark/workflows/${newId}/batches`, {
          method: 'POST',
          body: JSON.stringify({
            mcpServerIds: mcpServerIds || [],
            testerTargets: testerTargets || [],
          }),
        });
        const queued = batchResult?.summary?.queued || 0;
        const failed = batchResult?.summary?.failed || 0;
        // Tone grades the dispatch outcome: all queued → ok, none
        // queued → bad, partial → warn.
        toast.show({
          tone: failed === 0 ? 'ok' : queued === 0 ? 'bad' : 'warn',
          title: 'Benchmark started',
          description: `${queued} cell${queued === 1 ? '' : 's'} queued${failed > 0 ? `, ${failed} failed to dispatch` : ''}.`,
        });
        setCreateOpen(false);
        const newBatchId = batchResult?.batch?.id;
        // Deep-link the new batch into the detail page's inline panel
        // via ?batch= when we got an id back.
        navigate(newBatchId
          ? `/benchmark/workflows/${newId}?batch=${newBatchId}`
          : `/benchmark/workflows/${newId}`);
      } catch (batchErr) {
        toast.show({
          tone: 'warn',
          title: 'Workflow created — batch dispatch failed',
          description: batchErr.message,
        });
        setCreateOpen(false);
        navigate(`/benchmark/workflows/${newId}`);
      }
    } catch (err) {
      toast.show({ tone: 'bad', title: 'Create failed', description: err.message });
    } finally {
      setCreating(false);
    }
  }

  // "New benchmark" is disabled until at least one source (production or
  // benchmark) exists — the create dialog's first screen picks sources.
  const noSources = allSourceRows.length === 0;
  const createDisabledReason = !canManageBenchmarks
    ? benchmarkActionDisabledReason
    : noSources
      ? 'Add at least one MCP or benchmark source first'
      : undefined;
  function openCreate() {
    if (!canManageBenchmarks || noSources) return;
    setCreateOpen(true);
  }
  // True when a row click/keypress landed on an interactive descendant
  // (button, link, form control, …) — those handle themselves, so the
  // row-level navigation must not also fire.
  function benchmarkRowClickHitsAction(event) {
    const target = event.target;
    return Boolean(target && typeof target.closest === 'function' && target.closest(
      'button, a, input, select, textarea, [role="button"], [data-row-action]',
    ));
  }
  function openBenchmark(workflow, event) {
    if (benchmarkRowClickHitsAction(event)) return;
    navigate(`/benchmark/workflows/${workflow.id}`);
  }
  // Keyboard affordance for the clickable rows: Enter or Space opens the
  // workflow (rows carry tabIndex={0}).
  function handleBenchmarkRowKeyDown(workflow, event) {
    if (benchmarkRowClickHitsAction(event) || (event.key !== 'Enter' && event.key !== ' ')) return;
    event.preventDefault();
    navigate(`/benchmark/workflows/${workflow.id}`);
  }

  return (
    <div className="page-inner benchmark-page">
      <div className="ui-page-head">
        <h1 className="ui-page-title">Benchmark</h1>
        <div className="ui-page-actions">
          <Button
            variant="primary"
            size="sm"
            disabled={!canManageBenchmarks || noSources}
            title={createDisabledReason}
            onClick={openCreate}>
            <Icon name="plus" size={13} />New benchmark
          </Button>
        </div>
      </div>

      <BenchmarkSourcesArea
        loading={sourcesLoading}
        mcpRows={benchmarkMcpRows}
        cliRows={benchmarkCliRows}
        canManageSources={canManageSources}
        sourceActionDisabledReason={sourceActionDisabledReason}
        onAddMcp={() => setAddBenchmarkKind('mcp')}
        onAddCli={() => setAddBenchmarkKind('cli')}
        onDelete={(server) => setPendingDeleteSource({ id: server.id, name: server.name })} />

      {workflows.loading && !workflows.data ? (
        <div className="benchmark-loader"><LoadingSpinner /></div>
      ) : rows.length === 0 ? (
        <div className="card benchmark-empty-card">
          <EmptyState
            icon="chart"
            title="No benchmark workflows yet"
            body="Define a task once, then run it across multiple MCPs/CLIs and models to compare results."
            action={(
              <Button
                variant="primary"
                disabled={!canManageBenchmarks || noSources}
                title={createDisabledReason}
                onClick={openCreate}>
                <Icon name="plus" size={13} />New benchmark
              </Button>
            )} />
        </div>
      ) : (
        <div className="card benchmark-table-wrap benchmark-index-table-wrap">
          <table className="ui-table benchmark-index-table">
            <thead>
              <tr>
                <th>Benchmark</th>
                <th>Last run</th>
                <th>Sources tested</th>
                <th>Leaderboard</th>
                <th>Status</th>
                <th className="col-actions"><span className="ui-visually-hidden">Actions</span></th>
              </tr>
            </thead>
            <tbody>
              {rows.map((workflow) => {
                const desc = workflow.description || workflow.current_version?.tester_prompt || '';
                // Best-available "last activity" timestamp, in preference
                // order; falls back to the workflow's own updated_at.
                const latestAt = workflow.benchmark_latest_batch_started_at
                  || workflow.last_run?.started_at
                  || workflow.last_run?.created_at
                  || workflow.updated_at;
                const sourceCount = workflow.benchmark_latest_source_count || 0;
                const batchCount = workflow.benchmark_batch_count || 0;
                const totalCells = workflow.benchmark_latest_total_runs || 0;
                return (
                  <tr
                    key={workflow.id}
                    className="ui-table-row-clickable benchmark-index-row"
                    tabIndex={0}
                    aria-label={`Open benchmark ${workflow.name || 'Untitled benchmark'}`}
                    onClick={(event) => openBenchmark(workflow, event)}
                    onKeyDown={(event) => handleBenchmarkRowKeyDown(workflow, event)}>
                    <td>
                      <div className="benchmark-index-main">
                        <span className="benchmark-index-name">{workflow.name || 'Untitled benchmark'}</span>
                        {desc ? <span className="benchmark-index-desc text-xs muted">{desc}</span> : null}
                      </div>
                    </td>
                    <td>
                      <div className="benchmark-index-stacked">
                        <span>{latestAt ? formatRelative(latestAt) : 'Never'}</span>
                        {batchCount > 0 && (
                          <span className="text-xs muted">{formatNumber(batchCount)} batch{batchCount === 1 ? '' : 'es'}</span>
                        )}
                      </div>
                    </td>
                    <td>
                      <div className="benchmark-index-stacked">
                        <span>{sourceCount > 0 ? formatNumber(sourceCount) : '—'}</span>
                        {totalCells > 0 && <span className="text-xs muted">{formatNumber(totalCells)} cells last run</span>}
                      </div>
                    </td>
                    <td>
                      {workflow.benchmark_top_source_name ? (
                        <div className="benchmark-index-leader">
                          <span className="benchmark-index-leader-name">{workflow.benchmark_top_source_name}</span>
                          <BenchmarkScoreChip score={roundBenchmarkScore(workflow.benchmark_top_avg_score)} />
                        </div>
                      ) : (
                        <span className="text-xs muted">No completed runs</span>
                      )}
                    </td>
                    <td>
                      {workflow.benchmark_latest_batch_status
                        ? <BenchmarkBatchStatusPill status={workflow.benchmark_latest_batch_status} />
                        : <span className="text-xs muted">Not run</span>}
                    </td>
                    <td className="col-actions">
                      <div className="benchmark-index-actions">
                        <button
                          type="button"
                          className="btn btn-sm btn-ghost"
                          data-row-action="true"
                          onClick={() => navigate(`/benchmark/workflows/${workflow.id}`)}
                          title="Open benchmark">
                          <Icon name="chevronRight" size={12} />
                        </button>
                      </div>
                    </td>
                  </tr>
                );
              })}
            </tbody>
          </table>
        </div>
      )}

      {createOpen && (
        <BenchmarkWorkflowCreateDialog
          onClose={() => setCreateOpen(false)}
          saving={creating}
          sources={allSourceRows}
          models={catalog.data?.models || []}
          onSubmit={createBenchmarkWorkflow} />
      )}

      {addBenchmarkKind && (
        <BenchmarkSourceConnectModal
          targetKind={addBenchmarkKind}
          onClose={() => setAddBenchmarkKind(null)}
          refreshSources={sources.reload} />
      )}

      <ConfirmDialog
        open={Boolean(pendingDeleteSource)}
        tone="danger"
        title={pendingDeleteSource ? `Delete benchmark source "${pendingDeleteSource.name}"?` : ''}
        description="This removes the source. Past batches that referenced it keep their results, but it's no longer available for new runs. This action can't be undone."
        confirmLabel="Delete source"
        confirmBusyLabel="Deleting"
        busy={deletingSource}
        onCancel={() => { if (!deletingSource) setPendingDeleteSource(null); }}
        onConfirm={confirmDeleteSource} />
    </div>
  );
}

// ---------------------------------------------------------------------------
// Benchmark sources area: two side-by-side cards (MCPs and CLIs) that sit
// at the top of the /benchmark index page. Mirrors the production source
// card density 1:1 so the two surfaces feel interchangeable.
// ---------------------------------------------------------------------------

// Presentational wrapper for the two benchmark-source cards.
// Props:
//   loading                    — true while the source list is first loading.
//   mcpRows / cliRows          — benchmark source rows split by target kind.
//   canManageSources           — 'editor'-gated; disables add/delete.
//   sourceActionDisabledReason — tooltip shown when the above is false.
//   onAddMcp / onAddCli        — open the connect modal for that kind.
//   onDelete(server)           — stage a source for delete confirmation.
function BenchmarkSourcesArea({
  loading,
  mcpRows,
  cliRows,
  canManageSources,
  sourceActionDisabledReason,
  onAddMcp,
  onAddCli,
  onDelete,
}) {
  // Same on-brand offset "window" container used on /runs/:id and the
  // Insights daily digest, with the mono eyebrow on the left and the
  // surface URL on the right.
  return (
    <section className="window benchmark-sources-window" aria-label="Benchmark sources">
      <div className="window-head">
        <span>BENCHMARK SOURCES</span>
      </div>
      <div className="benchmark-sources-grid">
        <BenchmarkSourceColumn
          title="Benchmark MCPs"
          kind="mcp"
          loading={loading}
          rows={mcpRows}
          emptyTitle="No benchmark MCPs yet"
          emptyBody="Add competitor MCPs to compare against your own."
          addLabel="Add benchmark MCP"
          canManageSources={canManageSources}
          sourceActionDisabledReason={sourceActionDisabledReason}
          onAdd={onAddMcp}
          onDelete={onDelete} />
        <BenchmarkSourceColumn
          title="Benchmark CLIs"
          kind="cli"
          loading={loading}
          rows={cliRows}
          emptyTitle="No benchmark CLIs yet"
          emptyBody="Add competitor CLIs to compare against your own."
          addLabel="Add benchmark CLI"
          canManageSources={canManageSources}
          sourceActionDisabledReason={sourceActionDisabledReason}
          onAdd={onAddCli}
          onDelete={onDelete} />
      </div>
    </section>
  );
}

// One column of the benchmark-sources grid: a section head (title, row
// count, add button) above either loading skeletons, an empty state, or a
// grid of McpServerCards. `kind` ('mcp' | 'cli') only selects the
// empty-state icon; the rows themselves are pre-filtered by the caller.
function BenchmarkSourceColumn({
  title,
  kind,
  loading,
  rows,
  emptyTitle,
  emptyBody,
  addLabel,
  canManageSources,
  sourceActionDisabledReason,
  onAdd,
  onDelete,
}) {
  return (
    <section className="benchmark-source-column">
      <div className="ui-section-head">
        <div className="ui-section-title">
          {title}
          {!loading && rows.length > 0 && (
            <span className="ui-section-count">{rows.length}</span>
          )}
        </div>
        <Button
          size="sm"
          disabled={loading || !canManageSources}
          onClick={onAdd}
          title={loading
            ? 'Loading…'
            : !canManageSources ? sourceActionDisabledReason : undefined}>
          <Icon name="plus" size={13} />{addLabel}
        </Button>
      </div>
      {loading ? (
        <div className="mcp-server-grid">
          {/* Two skeleton cards approximating the loaded-card footprint. */}
          {Array.from({ length: 2 }).map((_, idx) => (
            <div key={idx} className="mcp-card mcp-card-loading">
              <span className="skel skel-mid"></span>
              <span className="skel skel-long" style={{ marginTop: 10 }}></span>
              <span className="skel skel-short" style={{ marginTop: 10 }}></span>
            </div>
          ))}
        </div>
      ) : rows.length === 0 ? (
        <div className="ui-surface alerts-empty">
          <EmptyState
            icon={kind === 'cli' ? 'terminal' : 'mcp'}
            title={emptyTitle}
            body={emptyBody} />
        </div>
      ) : (
        <div className="mcp-server-grid">
          {/* monitors/onOpen/onPeekMonitors are intentionally empty/null:
              benchmark sources are never monitored, so the cards are
              delete-only (and delete is role-gated). */}
          {rows.map((server) => (
            <McpServerCard
              key={server.id}
              server={server}
              monitors={[]}
              onOpen={null}
              onPeekMonitors={null}
              onDelete={canManageSources ? () => onDelete(server) : null} />
          ))}
        </div>
      )}
    </section>
  );
}

// ---------------------------------------------------------------------------
// Connect-flow modal for adding a benchmark MCP or CLI source. Reuses the
// shared `useMcpConnectFormState` hook + `McpConnectModal` shell from the
// production sources page, plus the shared `useMcpProvisioningPoll` for
// the CLI/hosted progress timeline. The submit path always POSTs to
// /api/benchmark/sources (so the row is stamped is_benchmark = true) and
// skips the post-create tool-monitor wizard — benchmark sources are
// never monitored.
// ---------------------------------------------------------------------------

// Modal for adding a benchmark MCP ('mcp') or CLI ('cli') source.
// Props:
//   targetKind     — locks the shared connect form to the right shape.
//   onClose        — unmounts the modal (parent clears addBenchmarkKind).
//   refreshSources — the parent's useApiResource reload for the list.
function BenchmarkSourceConnectModal({ targetKind, onClose, refreshSources }) {
  const toast = useToast();
  const formState = useMcpConnectFormState({ targetKind });
  const [serverSaving, setServerSaving] = useStateB(false);
  // Non-null while an OAuth popup flow is running; may carry the popup
  // window handle once opened so cancel can close it.
  const [oauthInFlight, setOauthInFlight] = useStateB(null);
  // Provisioning-timeline state + poll bookkeeping for cli/hosted setups,
  // consumed by useMcpProvisioningPoll and McpConnectModal below.
  const [cliSetupState, setCliSetupState] = useStateB(null);
  const [pendingCliTargetId, setPendingCliTargetId] = useStateB(null);
  const [pendingCliPollCount, setPendingCliPollCount] = useStateB(0);
  // Inline (non-toast) message shown in the modal, e.g. after a canceled
  // OAuth sign-in.
  const [message, setMessage] = useStateB('');

  // Reuse the production-page polling hook so the progress-timeline
  // behavior stays in lock-step. `refreshSources` is the parent's
  // useApiResource reload — it returns the new list response so the
  // hook can pull row-status overrides AND the failed-row error code
  // from the source list (Greptile #457 P2). Without this await the
  // failure modal would silently omit error codes that only land on
  // the list row.
  useMcpProvisioningPoll({
    cliSetupState, setCliSetupState,
    pendingCliTargetId, setPendingCliTargetId,
    pendingCliPollCount, setPendingCliPollCount,
    reload: refreshSources,
  });

  // Best-effort close of the OAuth popup; the in-flight state itself is
  // cleared by addBenchmarkSource's finally once runMcpOAuthFlow settles.
  function cancelOauthInFlight() {
    const popup = oauthInFlight?.popup;
    if (popup && !popup.closed) {
      try { popup.close(); } catch (_error) {}
    }
  }

  // Probes the new source's tool surface so it materializes in the
  // workflow picker, without dragging the operator into the
  // tool-monitor wizard.
  async function quietProbe(serverId) {
    if (!serverId) return;
    try {
      await apiFetch(`/api/mcp-servers/${serverId}/probe`, { method: 'POST' });
      refreshSources();
    } catch (probeError) {
      // Probe failure is non-fatal: the source exists, its tools just
      // haven't been discovered yet. Warn rather than fail the add.
      toast.show({
        tone: 'warn',
        title: 'Could not probe benchmark source',
        description: probeError.message
          || 'The source was added but its tool surface could not be discovered.',
      });
    }
  }

  // Submit handler. Creates the source via /api/benchmark/sources, then
  // branches on the built payload kind:
  //   cli/hosted      → kick off the provisioning timeline and keep the
  //                     modal open so the operator can watch progress.
  //   remote + oauth  → run the popup OAuth flow and react to its outcome.
  //   remote (other)  → toast, close, then quietly probe the tool surface.
  async function addBenchmarkSource() {
    if (serverSaving) return;
    setServerSaving(true);
    setMessage('');
    try {
      const validationError = validateMcpConnectFormInputs(formState);
      if (validationError) throw new Error(validationError);
      const built = buildMcpConnectPayload(formState);
      const created = await apiFetch('/api/benchmark/sources', {
        method: 'POST',
        body: JSON.stringify(built.payload),
      });
      const newServerId = created?.mcpServer?.id;

      if (newServerId && (built.kind === 'cli' || built.kind === 'hosted')) {
        // Seed poll state before cliSetupState so the hook sees a
        // consistent target when it next runs.
        setPendingCliTargetId(newServerId);
        setPendingCliPollCount(0);
        setCliSetupState({
          serverId: newServerId,
          serverName: formState.serverName.trim(),
          setupKind: built.kind,
          provisioningRunId: created?.provisioningRun?.id || null,
          status: created?.mcpServer?.provisioning_status || 'pending',
          failedAtStatus: null,
          errorCode: null,
          updatedAt: new Date().toISOString(),
        });
        toast.show({
          tone: 'ok',
          title: 'Benchmark source added',
          description: 'Provisioning has started. The source will be available once ready.',
        });
        refreshSources();
        return;
      }

      if (newServerId && built.kind === 'remote' && built.oauth) {
        const oauthStart = await startOAuthConnect({
          mcpServerId: newServerId,
          providerSlug: built.oauth.providerSlug,
          requestedScopes: built.oauth.requestedScopes,
          authConfig: {},
        });
        refreshSources();
        const inFlight = {
          popup: null,
          providerLabel: built.oauth.providerSlug || 'the provider',
          redirectUriMode: oauthStart?.redirectUriMode === 'loopback' ? 'loopback' : 'web',
        };
        setOauthInFlight(inFlight);
        let popupResult;
        try {
          popupResult = await runMcpOAuthFlow(oauthStart, {
            // Stash the popup handle (if one opens) so cancelOauthInFlight
            // can close it; guard against the state already being cleared.
            onPopupOpened: (popup) => setOauthInFlight((current) => current ? { ...current, popup } : current),
          });
        } finally {
          setOauthInFlight(null);
        }
        // 'navigated' — presumably the flow took over this page with a
        // full navigation, so there is nothing left to do here (confirm
        // against runMcpOAuthFlow).
        if (popupResult.outcome === 'navigated') return;
        refreshSources();
        if (popupResult.outcome === 'connected') {
          toast.show({
            tone: 'ok',
            title: 'Benchmark source added',
            description: 'OAuth completed. Discovering tools…',
          });
          await quietProbe(newServerId);
          onClose();
        } else if (popupResult.outcome === 'failed') {
          toast.show({
            tone: 'bad',
            title: 'OAuth failed',
            description: popupResult.message || 'OAuth authorization did not complete.',
          });
        } else if (popupResult.outcome === 'pending') {
          toast.show({
            tone: 'warn',
            title: 'OAuth completed; provisioning is still in progress',
            description: 'Tools will appear once provisioning finishes.',
          });
          onClose();
        } else if (popupResult.outcome === 'closed') {
          // Operator closed the popup: keep the modal open with an
          // inline hint so they can retry without re-entering the form.
          setMessage('Sign-in was canceled. Submit again to retry.');
        }
        return;
      }

      // Remote source with non-OAuth auth (api-key / none). Fall straight
      // through to a probe so the tool surface materializes.
      toast.show({
        tone: 'ok',
        title: 'Benchmark source added',
        description: 'You can now use this source in benchmark batches.',
      });
      refreshSources();
      onClose();
      if (newServerId && built.kind === 'remote') {
        await quietProbe(newServerId);
      }
    } catch (error) {
      toast.show({ tone: 'bad', title: 'Connect failed', description: error.message });
    } finally {
      setServerSaving(false);
    }
  }

  // Resets all local state before closing so a re-open starts clean;
  // no-ops while a save is mid-flight.
  function handleCancel() {
    if (serverSaving) return;
    formState.reset();
    setMessage('');
    setCliSetupState(null);
    setPendingCliTargetId(null);
    setPendingCliPollCount(0);
    onClose();
  }

  return (
    <McpConnectModal
      open
      message={message}
      formState={formState}
      saving={serverSaving}
      oauthInFlight={oauthInFlight}
      onCancelOauth={cancelOauthInFlight}
      onSubmit={addBenchmarkSource}
      onCancel={handleCancel}
      cliSetupState={cliSetupState}
      submitLabel="Add benchmark source"
      submitLoadingLabel="Adding benchmark source..."
      showOpenTargetCta={false}
      onOpenSetupTarget={() => undefined} />
  );
}

// ---------------------------------------------------------------------------
// Workflow detail view (header + aggregate leaderboard + past batches +
// inline batch panel)
// ---------------------------------------------------------------------------

// Consolidated detail page for one benchmark workflow: title-switcher
// header, optional inline definition editor, aggregate leaderboard,
// sortable past-batches table, and the inline panel for the batch the
// user currently has selected (via the ?batch= query param).
//
// Props:
//   navigate    — router navigation function; supports { replace: true }.
//   workflowId  — id of the benchmark workflow to display.
//   queryString — raw location query string carrying the ?batch= deep link.
function BenchmarkWorkflowDetailView({ navigate, workflowId, queryString }) {
  const auth = useAuth();
  const toast = useToast();
  // Tenant-level gating; when actions are disallowed the reason string
  // becomes the tooltip on every disabled button below.
  const canManageBenchmarks = canUseTenantAction(auth);
  const benchmarkActionDisabledReason = tenantActionDisabledReason(auth);
  const workflow = useApiResource(`/api/benchmark/workflows/${workflowId}`, [workflowId]);
  const allWorkflows = useApiResource('/api/benchmark/workflows');
  const batches = useApiResource(`/api/benchmark/workflows/${workflowId}/batches`, [workflowId]);
  const leaderboard = useApiResource(`/api/benchmark/workflows/${workflowId}/leaderboard`, [workflowId]);
  const sources = useApiResource('/api/benchmark/sources');
  const productionSources = useApiResource('/api/mcp-servers');
  const catalog = useApiResource('/api/catalog/models');
  // UI state: inline editor, run/create dialogs, and in-flight flags.
  const [editing, setEditing] = useStateB(false);
  const [savingEdit, setSavingEdit] = useStateB(false);
  const [runOpen, setRunOpen] = useStateB(false);
  const [runSaving, setRunSaving] = useStateB(false);
  const [createOpen, setCreateOpen] = useStateB(false);
  const [creating, setCreating] = useStateB(false);
  // Batches archived this session are hidden client-side immediately
  // (optimistic); presumably the server also omits them on the next
  // fetch — TODO confirm against the batches endpoint.
  const [archivedBatchIds, setArchivedBatchIds] = useStateB(() => new Set());
  const [archivingBatchId, setArchivingBatchId] = useStateB(null);

  // Edit-form fields, seeded from the fetched definition by the effect below.
  const [name, setName] = useStateB('');
  const [description, setDescription] = useStateB('');
  const [prompt, setPrompt] = useStateB('');
  const [criteriaText, setCriteriaText] = useStateB('');

  // Past-batches sort state. Defaults to "Started DESC" (most-recent first),
  // which matches the server response order — so users see "no movement" on
  // first paint, but the Started column shows the ▼ indicator so it's
  // immediately obvious how the rows got there.
  const [batchesSortKey, setBatchesSortKey] = useStateB(DEFAULT_BATCHES_SORT.key);
  const [batchesSortDir, setBatchesSortDir] = useStateB(DEFAULT_BATCHES_SORT.dir);
  const batchesSortIsDefault = isDefaultSort(batchesSortKey, batchesSortDir, DEFAULT_BATCHES_SORT);
  // Clicking the already-active column toggles direction; clicking a new
  // column picks that column's natural direction.
  function setBatchesSort(key) {
    if (batchesSortKey === key) {
      setBatchesSortDir((d) => (d === 'desc' ? 'asc' : 'desc'));
      return;
    }
    setBatchesSortKey(key);
    setBatchesSortDir(BENCHMARK_DESC_DEFAULT_KEYS.has(key) ? 'desc' : 'asc');
  }
  function resetBatchesSort() {
    setBatchesSortKey(DEFAULT_BATCHES_SORT.key);
    setBatchesSortDir(DEFAULT_BATCHES_SORT.dir);
  }

  // Seed / refresh the edit form whenever the workflow definition (re)loads.
  useEffectB(() => {
    if (!workflow.data) return;
    const wf = workflow.data.workflow || {};
    const version = workflow.data.currentVersion || {};
    const criteria = workflow.data.criteria || [];
    setName(wf.name || '');
    setDescription(wf.description || '');
    setPrompt(version.tester_prompt || '');
    setCriteriaText(criteria.map((c) => c.criterion_text).join('\n'));
  }, [workflow.data]);

  // Production MCP servers merged with benchmark-only sources for the
  // run/create pickers; `_origin` tags each row's provenance. Production
  // rows are listed first.
  const allSourceRows = useMemoB(() => {
    const benchmark = (sources.data?.rows || []).map((row) => ({ ...row, _origin: 'benchmark' }));
    const production = (productionSources.data?.rows || []).map((row) => ({ ...row, _origin: 'production' }));
    return [...production, ...benchmark];
  }, [sources.data, productionSources.data]);

  // Resolve the inlined batch from the URL query (?batch=<id>) or default
  // to the most recent batch. Re-deriving on every render keeps both
  // user-driven row clicks and freshly-loaded batches.data flowing
  // through the same selection path.
  const batchRows = (batches.data?.rows || []).filter((b) => !archivedBatchIds.has(b.id));
  const params = useMemoB(() => new URLSearchParams(queryString || ''), [queryString]);
  const requestedBatchId = params.get('batch');
  // Falls back to the newest row when the requested id is unknown (e.g. a
  // stale deep link, or a batch the user just archived).
  const inlinedBatchId = useMemoB(() => {
    if (!batchRows.length) return null;
    if (requestedBatchId && batchRows.some((b) => b.id === requestedBatchId)) {
      return requestedBatchId;
    }
    return batchRows[0].id;
  }, [batchRows, requestedBatchId]);
  // Must stay above any early returns below — `workflow.loading` /
  // `workflow.error` bail out before reaching the JSX, so hoisting this
  // hook keeps the render's hook count stable across loading → loaded
  // transitions (Rules of Hooks).
  const sortedBatchRows = useMemoB(
    () => sortBenchmarkBatches(batchRows, batchesSortKey, batchesSortDir),
    [batchRows, batchesSortKey, batchesSortDir],
  );

  // Point the inline panel at `batchId` by rewriting the ?batch= query
  // param (replace-state, so row clicks don't pollute browser history).
  function selectBatch(batchId) {
    const next = new URLSearchParams(queryString || '');
    if (batchId) next.set('batch', batchId);
    else next.delete('batch');
    const qs = next.toString();
    navigate(
      `/benchmark/workflows/${workflowId}${qs ? `?${qs}` : ''}`,
      { replace: true },
    );
  }

  function navigateToWorkflow(nextId) {
    if (!nextId || nextId === workflowId) return;
    // Drop ?batch= because batch ids belong to a single workflow.
    navigate(`/benchmark/workflows/${nextId}`);
  }

  // Create a new benchmark workflow from the create-dialog spec and, when
  // `runFirstBatch` is set, immediately dispatch its first batch. Navigates
  // to the new workflow (deep-linking the new batch when one was started).
  async function createBenchmarkWorkflow(spec, { runFirstBatch }) {
    if (!requirePaidAction(auth, navigate, { requiredRole: 'editor' })) return;
    setCreating(true);
    try {
      // Batch-level fields are peeled off the spec; only the definition
      // goes to the workflow endpoint. mcpServerId is sent as null —
      // presumably a legacy single-source field superseded by the
      // per-batch mcpServerIds list; confirm against the API.
      const { mcpServerIds, testerTargets, ...workflowSpec } = spec;
      const result = await apiFetch('/api/benchmark/workflows', {
        method: 'POST',
        body: JSON.stringify({ ...workflowSpec, mcpServerId: null }),
      });
      const newId = result?.workflow?.id;
      if (!newId) {
        // Defensive: response carried no id — refresh the index list
        // instead of navigating to an unknown URL.
        toast.show({ tone: 'ok', title: 'Benchmark workflow created' });
        setCreateOpen(false);
        allWorkflows.reload();
        return;
      }
      if (!runFirstBatch) {
        toast.show({ tone: 'ok', title: 'Benchmark workflow created' });
        setCreateOpen(false);
        navigate(`/benchmark/workflows/${newId}`);
        return;
      }
      // Workflow exists at this point, so a batch-dispatch failure is
      // reported as a warning and we still land on the new workflow.
      try {
        const batchResult = await apiFetch(`/api/benchmark/workflows/${newId}/batches`, {
          method: 'POST',
          body: JSON.stringify({
            mcpServerIds: mcpServerIds || [],
            testerTargets: testerTargets || [],
          }),
        });
        const queued = batchResult?.summary?.queued || 0;
        const failed = batchResult?.summary?.failed || 0;
        toast.show({
          tone: failed === 0 ? 'ok' : queued === 0 ? 'bad' : 'warn',
          title: 'Benchmark started',
          description: `${queued} cell${queued === 1 ? '' : 's'} queued${failed > 0 ? `, ${failed} failed to dispatch` : ''}.`,
        });
        setCreateOpen(false);
        const newBatchId = batchResult?.batch?.id;
        navigate(newBatchId
          ? `/benchmark/workflows/${newId}?batch=${newBatchId}`
          : `/benchmark/workflows/${newId}`);
      } catch (batchErr) {
        toast.show({
          tone: 'warn',
          title: 'Workflow created — batch dispatch failed',
          description: batchErr.message,
        });
        setCreateOpen(false);
        navigate(`/benchmark/workflows/${newId}`);
      }
    } catch (err) {
      toast.show({ tone: 'bad', title: 'Create failed', description: err.message });
    } finally {
      setCreating(false);
    }
  }

  // PATCH the edited definition fields, then refresh both this page and
  // the index list so the switcher label stays in sync.
  async function saveEdit() {
    if (!requirePaidAction(auth, navigate, { requiredRole: 'editor' })) return;
    setSavingEdit(true);
    try {
      await apiFetch(`/api/benchmark/workflows/${workflowId}`, {
        method: 'PATCH',
        body: JSON.stringify({
          name: name.trim(),
          description: description.trim() || null,
          testerPrompt: prompt.trim(),
          // One criterion per non-blank line of the textarea.
          criteria: criteriaText.split('\n').map((line) => line.trim()).filter(Boolean),
        }),
      });
      toast.show({ tone: 'ok', title: 'Benchmark workflow saved' });
      setEditing(false);
      workflow.reload();
      allWorkflows.reload();
    } catch (err) {
      toast.show({ tone: 'bad', title: 'Save failed', description: err.message });
    } finally {
      setSavingEdit(false);
    }
  }

  // Dispatch a new batch for this workflow and select it inline.
  async function runBatch(spec) {
    if (!requirePaidAction(auth, navigate, { requiredRole: 'editor' })) return;
    setRunSaving(true);
    try {
      const result = await apiFetch(`/api/benchmark/workflows/${workflowId}/batches`, {
        method: 'POST',
        body: JSON.stringify(spec),
      });
      const queued = result?.summary?.queued || 0;
      const failed = result?.summary?.failed || 0;
      toast.show({
        tone: failed === 0 ? 'ok' : queued === 0 ? 'bad' : 'warn',
        title: 'Benchmark started',
        description: `${queued} cell${queued === 1 ? '' : 's'} queued${failed > 0 ? `, ${failed} failed to dispatch` : ''}.`,
      });
      setRunOpen(false);
      const newBatchId = result?.batch?.id;
      // Reload batches first so the new row exists before we change the
      // ?batch= query param — otherwise inlinedBatchId falls back to the
      // previous most-recent row for one render cycle.
      await batches.reload();
      if (newBatchId) selectBatch(newBatchId);
    } catch (err) {
      toast.show({ tone: 'bad', title: 'Run failed', description: err.message });
    } finally {
      setRunSaving(false);
    }
  }

  // Archive the whole workflow (server keeps past batches accessible per
  // the confirm copy) and return to the benchmark index.
  async function archive() {
    if (!requirePaidAction(auth, navigate, { requiredRole: 'editor' })) return;
    if (!window.confirm('Archive this benchmark workflow? Past batches will remain accessible.')) return;
    try {
      await apiFetch(`/api/benchmark/workflows/${workflowId}`, { method: 'DELETE' });
      toast.show({ tone: 'ok', title: 'Benchmark workflow archived' });
      navigate('/benchmark');
    } catch (err) {
      toast.show({ tone: 'bad', title: 'Archive failed', description: err.message });
    }
  }

  // Archive a single batch row. stopPropagation keeps the row's own click
  // handler from also selecting the batch being archived.
  async function archiveBatch(e, batchId) {
    e.stopPropagation();
    if (!requirePaidAction(auth, navigate, { requiredRole: 'editor' })) return;
    if (!window.confirm('Archive this batch? It will be hidden from the past-batches list and excluded from the leaderboard.')) return;
    setArchivingBatchId(batchId);
    try {
      await apiFetch(`/api/benchmark/batches/${batchId}/archive`, { method: 'PATCH', body: JSON.stringify({ archived: true }) });
      // Hide the row immediately; the leaderboard is re-fetched because the
      // server excludes archived batches from aggregates.
      setArchivedBatchIds((prev) => new Set([...prev, batchId]));
      toast.show({ tone: 'ok', title: 'Batch archived' });
      leaderboard.reload();
    } catch (err) {
      toast.show({ tone: 'bad', title: 'Archive failed', description: err.message });
    } finally {
      setArchivingBatchId(null);
    }
  }

  // Initial load: full-page spinner (only while no data has ever arrived).
  if (workflow.loading && !workflow.data) {
    return (
      <div className="page-inner benchmark-page">
        <div className="benchmark-loader"><LoadingSpinner /></div>
      </div>
    );
  }

  if (workflow.error) {
    return (
      <div className="page-inner benchmark-page">
        <div className="card benchmark-empty-card">
          <EmptyState
            icon="alert"
            title="Couldn't load this benchmark"
            body={workflow.error.message || 'Try reloading the page.'}
            action={(
              <Button variant="ghost" onClick={() => navigate('/benchmark')}>
                <Icon name="chevronLeft" size={12} />Back to benchmarks
              </Button>
            )} />
        </div>
      </div>
    );
  }

  // Safe to read below the loading/error bailouts above.
  const wf = workflow.data?.workflow || {};
  const leaderboardRows = leaderboard.data?.rows || [];
  const workflowOptions = (allWorkflows.data?.rows || []).map((row) => ({
    value: row.id,
    label: row.name || 'Untitled benchmark',
  }));
  // Pre-fill the run dialog from the most recent batch's config so users
  // can re-run the same matrix in one click.
  const runDefaults = pickRunDialogDefaults(batchRows);
  const noSources = allSourceRows.length === 0;
  const createDisabledReason = !canManageBenchmarks
    ? benchmarkActionDisabledReason
    : noSources
      ? 'Add at least one MCP or benchmark source first'
      : undefined;

  return (
    <div className="page-inner benchmark-page">
      <header className="ui-page-head benchmark-detail-head">
        <div className="benchmark-detail-head__title">
          <BenchmarkTitleSwitcher
            currentId={workflowId}
            currentLabel={wf.name || 'Benchmark'}
            options={workflowOptions}
            onSelect={navigateToWorkflow}
            onCreateNew={() => setCreateOpen(true)}
            createDisabled={!canManageBenchmarks || noSources}
            createDisabledHint={createDisabledReason} />
        </div>
        <div className="ui-page-actions">
          <Button size="sm" variant="ghost" onClick={() => navigate('/benchmark')}>
            <Icon name="chart" size={12} />Manage sources
          </Button>
          {!editing && (
            <Button
              size="sm"
              variant="secondary"
              disabled={!canManageBenchmarks}
              title={!canManageBenchmarks ? benchmarkActionDisabledReason : undefined}
              onClick={() => {
                if (!canManageBenchmarks) return;
                setEditing(true);
              }}>
              <Icon name="edit" size={12} />Edit definition
            </Button>
          )}
          <Button
            size="sm"
            variant="secondary"
            disabled={!canManageBenchmarks}
            title={!canManageBenchmarks ? benchmarkActionDisabledReason : undefined}
            onClick={archive}>
            <Icon name="trash" size={12} />Archive
          </Button>
          <Button
            size="sm"
            variant="primary"
            disabled={!canManageBenchmarks}
            title={!canManageBenchmarks ? benchmarkActionDisabledReason : undefined}
            onClick={() => {
              if (!canManageBenchmarks) return;
              setRunOpen(true);
            }}>
            <Icon name="play" size={12} />Run benchmark
          </Button>
        </div>
      </header>

      {wf.description && !editing && (
        <p className="benchmark-detail-description text-sm muted">{wf.description}</p>
      )}

      {editing && (
        <section className="card benchmark-editor-card">
          <div className="card-header">
            <div className="card-title">Edit definition</div>
          </div>
          <div className="benchmark-editor-card__body">
            <section className="mcp-form-section">
              <h4 className="mcp-form-section-title">Name</h4>
              <input
                className="input"
                value={name}
                disabled={!canManageBenchmarks || savingEdit}
                onChange={(e) => setName(e.target.value)} />
            </section>
            <section className="mcp-form-section">
              <h4 className="mcp-form-section-title">Description</h4>
              <textarea
                className="input"
                rows={2}
                value={description}
                disabled={!canManageBenchmarks || savingEdit}
                onChange={(e) => setDescription(e.target.value)} />
            </section>
            <section className="mcp-form-section">
              <h4 className="mcp-form-section-title">Tester prompt</h4>
              <textarea
                className="input"
                rows={4}
                value={prompt}
                disabled={!canManageBenchmarks || savingEdit}
                onChange={(e) => setPrompt(e.target.value)} />
            </section>
            <section className="mcp-form-section">
              <h4 className="mcp-form-section-title">Success criteria</h4>
              <p className="mcp-form-section-desc">One per line. Each line becomes one judge criterion.</p>
              <textarea
                className="input"
                rows={3}
                value={criteriaText}
                disabled={!canManageBenchmarks || savingEdit}
                onChange={(e) => setCriteriaText(e.target.value)} />
            </section>
          </div>
          <div className="benchmark-editor-card__actions">
            <Button disabled={savingEdit} onClick={() => setEditing(false)}>Cancel</Button>
            <Button
              variant="primary"
              disabled={!canManageBenchmarks || savingEdit}
              title={!canManageBenchmarks ? benchmarkActionDisabledReason : undefined}
              loading={savingEdit}
              loadingLabel="Saving..."
              onClick={saveEdit}>
              Save changes
            </Button>
          </div>
        </section>
      )}

      <section className="benchmark-section">
        <div className="ui-section-head">
          <div className="ui-section-title">Aggregate leaderboard</div>
          <span className="text-xs muted">Average across all completed batches</span>
        </div>
        <BenchmarkLeaderboardTable
          rows={leaderboardRows}
          loading={leaderboard.loading && !leaderboard.data}
          emptyHint="Run the benchmark at least once to populate this leaderboard." />
      </section>

      <section className="benchmark-section">
        <div className="ui-section-head">
          <div className="ui-section-title">Past batches</div>
          <span className="ui-section-count">{batchRows.length}</span>
        </div>
        {(batches.loading && !batches.data) ? (
          <div className="benchmark-loader benchmark-loader--small"><LoadingSpinner /></div>
        ) : batchRows.length === 0 ? (
          <div className="card benchmark-empty-card">
            <EmptyState
              icon="clock"
              title="No batches yet"
              body={'Click "Run benchmark" above to launch the first batch.'} />
          </div>
        ) : (
          <div className="card benchmark-table-wrap">
            <BenchmarkTableToolbar visible={!batchesSortIsDefault} onReset={resetBatchesSort} />
            <table className="ui-table benchmark-batches-table">
              <thead>
                <tr>
                  <BenchmarkSortHeader label="Batch" sortKey="batch" current={batchesSortKey} dir={batchesSortDir} onSort={setBatchesSort} />
                  <BenchmarkSortHeader label="Status" sortKey="status" current={batchesSortKey} dir={batchesSortDir} onSort={setBatchesSort} />
                  <BenchmarkSortHeader label="Cells" sortKey="cells" current={batchesSortKey} dir={batchesSortDir} onSort={setBatchesSort} />
                  <BenchmarkSortHeader label="Started" sortKey="started" current={batchesSortKey} dir={batchesSortDir} onSort={setBatchesSort} />
                  <BenchmarkSortHeader label="Duration" sortKey="duration" current={batchesSortKey} dir={batchesSortDir} onSort={setBatchesSort} />
                  <th className="col-actions"></th>
                </tr>
              </thead>
              <tbody>
                {sortedBatchRows.map((batch) => {
                  const isSelected = batch.id === inlinedBatchId;
                  return (
                    <tr
                      key={batch.id}
                      className={`ui-table-row-clickable${isSelected ? ' is-selected' : ''}`}
                      aria-current={isSelected ? 'true' : undefined}
                      onClick={() => selectBatch(batch.id)}>
                      <td><span className="mono text-xs">{shortId(batch.id)}</span></td>
                      <td><BenchmarkBatchStatusPill status={batch.status} /></td>
                      <td>
                        <span className="mono">{batch.completed_runs}/{batch.total_runs}</span>
                        {batch.failed_runs > 0 && (
                          <span className="text-xs muted"> ({batch.failed_runs} failed)</span>
                        )}
                      </td>
                      <td>{formatRelative(batch.started_at || batch.created_at)}</td>
                      <td>{batch.completed_at && batch.started_at
                        ? formatDuration(new Date(batch.completed_at).getTime() - new Date(batch.started_at).getTime())
                        : '—'}</td>
                      <td className="col-actions">
                        <button
                          type="button"
                          className="btn btn-sm btn-ghost runs-archive-btn"
                          disabled={!canManageBenchmarks || archivingBatchId === batch.id}
                          title={!canManageBenchmarks ? benchmarkActionDisabledReason : 'Archive batch'}
                          onClick={(e) => archiveBatch(e, batch.id)}>
                          {archivingBatchId === batch.id ? <Icon name="refresh" size={13} /> : <Icon name="archive" size={13} />}
                        </button>
                      </td>
                    </tr>
                  );
                })}
              </tbody>
            </table>
          </div>
        )}
      </section>

      {inlinedBatchId && (
        <BenchmarkBatchInlinePanel
          key={inlinedBatchId}
          batchId={inlinedBatchId}
          navigate={navigate} />
      )}

      {runOpen && (
        <BenchmarkRunDialog
          onClose={() => setRunOpen(false)}
          saving={runSaving}
          sources={allSourceRows}
          models={catalog.data?.models || []}
          defaults={runDefaults}
          onSubmit={runBatch} />
      )}

      {createOpen && (
        <BenchmarkWorkflowCreateDialog
          onClose={() => setCreateOpen(false)}
          saving={creating}
          sources={allSourceRows}
          models={catalog.data?.models || []}
          onSubmit={createBenchmarkWorkflow} />
      )}
    </div>
  );
}

// Derive the "Run benchmark" dialog's initial selection from the newest
// batch (the batches endpoint returns rows DESC by created_at, so row 0 is
// the last config the user dispatched). Returns null when neither servers
// nor tester targets can be prefilled; otherwise an object whose
// `mcpServerIds` / `testerTargets` arrays may individually be empty when
// only the other field was present on the latest batch.
function pickRunDialogDefaults(batchRows) {
  if (!batchRows || batchRows.length === 0) return null;
  const [latest] = batchRows;

  let serverIds = null;
  if (Array.isArray(latest.mcp_server_ids) && latest.mcp_server_ids.length > 0) {
    serverIds = latest.mcp_server_ids;
  }

  let targets = null;
  if (Array.isArray(latest.tester_targets) && latest.tester_targets.length > 0) {
    // Normalize snake_case rows to the camelCase shape the dialog expects,
    // dropping any target that lacks a model id.
    targets = latest.tester_targets
      .map((t) => ({
        harness: t.harness || t.harness_key || 'claude_code',
        modelId: t.modelId || t.model_id,
      }))
      .filter((t) => t.modelId);
  }

  if (serverIds === null && targets === null) return null;
  return {
    mcpServerIds: serverIds === null ? [] : serverIds,
    testerTargets: targets === null ? [] : targets,
  };
}

// ---------------------------------------------------------------------------
// Title-styled workflow switcher
// ---------------------------------------------------------------------------
//
// The detail page's H1 doubles as the workflow picker. We render a real
// <button> styled with the page-title typography, exposing its popup
// semantics to assistive tech via aria-haspopup="listbox" / aria-expanded,
// plus a popover menu that lists existing benchmarks and a "+ New
// benchmark" action below a divider. A stock <Select> would behave the
// same way but can't be themed out of its small-input chrome — hence
// this bespoke component.

function BenchmarkTitleSwitcher({
  currentId,
  currentLabel,
  options,
  onSelect,
  onCreateNew,
  createDisabled = false,
  createDisabledHint = '',
}) {
  const [open, setOpen] = useStateB(false);
  const rootRef = useRefB(null);

  useEffectB(() => {
    if (!open) return undefined;
    function onDocClick(event) {
      if (!rootRef.current?.contains(event.target)) setOpen(false);
    }
    function onKey(event) {
      if (event.key === 'Escape') {
        event.stopPropagation();
        setOpen(false);
      }
    }
    document.addEventListener('mousedown', onDocClick);
    document.addEventListener('keydown', onKey);
    return () => {
      document.removeEventListener('mousedown', onDocClick);
      document.removeEventListener('keydown', onKey);
    };
  }, [open]);

  return (
    <div
      ref={rootRef}
      className={`benchmark-title-switcher${open ? ' is-open' : ''}`}>
      <button
        type="button"
        className="benchmark-title-switcher__trigger"
        aria-haspopup="listbox"
        aria-expanded={open}
        aria-label="Switch benchmark workflow"
        onClick={() => setOpen((v) => !v)}>
        <span className="benchmark-title-switcher__label">{currentLabel}</span>
        <Icon name="chevronDown" size={14} className="benchmark-title-switcher__chevron" />
      </button>
      {open && (
        <div
          className="benchmark-title-switcher__menu"
          role="listbox"
          aria-label="Switch benchmark workflow">
          {options.length === 0 ? (
            <div className="benchmark-title-switcher__empty">No other benchmarks yet</div>
          ) : options.map((opt) => {
            const selected = opt.value === currentId;
            return (
              <button
                key={opt.value}
                type="button"
                role="option"
                aria-selected={selected}
                className={`benchmark-title-switcher__option${selected ? ' is-selected' : ''}`}
                onClick={() => {
                  setOpen(false);
                  if (!selected) onSelect(opt.value);
                }}>
                <span className="benchmark-title-switcher__option-label">{opt.label}</span>
                {selected && <Icon name="check" size={12} className="benchmark-title-switcher__tick" />}
              </button>
            );
          })}
          <div className="benchmark-title-switcher__divider" role="separator" />
          <button
            type="button"
            className="benchmark-title-switcher__option benchmark-title-switcher__option--action"
            disabled={createDisabled}
            title={createDisabled ? createDisabledHint : undefined}
            onClick={() => {
              if (createDisabled) return;
              setOpen(false);
              onCreateNew();
            }}>
            <Icon name="plus" size={13} />
            <span className="benchmark-title-switcher__option-label">New benchmark</span>
          </button>
        </div>
      )}
    </div>
  );
}

// ---------------------------------------------------------------------------
// Inline batch panel
// ---------------------------------------------------------------------------
//
// Renders the per-batch summary tiles, AI cross-source recap, and the
// per-cell leaderboard table. Each cell row links DIRECTLY to /runs/:id
// (no drawer / popover); the run trace page surfaces all per-run benchmark
// detail that used to live in the legacy slide-over.
//
// The whole panel sits inside an "offset card" wrapper so the current
// batch reads as visually distinct from the past-batches table above —
// see styles.css `.benchmark-current-batch-card`.

// System-error statuses we let users replay one at a time. Must match
// REPLAYABLE_FAILED_RUN_STATUSES on the server; the per-row "Replay" icon
// is only rendered when a run's status is in this set, so the affordance
// can't render somewhere the endpoint would reject (`canceled` runs and
// completed runs with a failing verdict are intentionally excluded).
// This is only the UI-side mirror — the server allowlist remains the
// source of truth and still validates every replay request.
const REPLAYABLE_RUN_STATUSES_UI = new Set(['tester_failed', 'evaluation_failed', 'timed_out']);

/**
 * Inline panel for one benchmark batch: summary tiles, the AI cross-source
 * recap card, and the sortable per-cell leaderboard table. Each cell row
 * links directly to /runs/:id; system-error rows additionally expose a
 * per-run "Replay" button behind a confirmation modal. While the batch is
 * still active the panel polls the batch endpoint every 5s.
 *
 * @param {object} props
 * @param {string} props.batchId - Batch to load (`/api/benchmark/batches/:id`).
 * @param {Function} props.navigate - App router push; used for row clicks and
 *   the paid-action upsell redirect inside `requirePaidAction`.
 */
function BenchmarkBatchInlinePanel({ batchId, navigate }) {
  const auth = useAuth();
  const toast = useToast();
  // Tenant-level write gating: when false, the Replay button renders
  // disabled with `benchmarkActionDisabledReason` as its tooltip.
  const canManageBenchmarks = canUseTenantAction(auth);
  const benchmarkActionDisabledReason = tenantActionDisabledReason(auth);
  const batch = useApiResource(`/api/benchmark/batches/${batchId}`, [batchId]);
  const [sortKey, setSortKey] = useStateB(DEFAULT_CELLS_SORT.key);
  const [sortDir, setSortDir] = useStateB(DEFAULT_CELLS_SORT.dir);
  // `replayTarget` is the full run row the user clicked "Replay" on, so
  // the confirmation modal can quote its source + harness + model.
  // Setting it to null closes the modal.
  const [replayTarget, setReplayTarget] = useStateB(null);
  // Track *each* in-flight replay independently — the modal can be
  // re-opened (and another row's replay launched) while a previous
  // request is still pending. A single scalar would let request A's
  // `finally` clear request B's busy flag and re-enable B's button
  // mid-flight.
  const [replayingRunIds, setReplayingRunIds] = useStateB(() => new Set());
  const sortIsDefault = isDefaultSort(sortKey, sortDir, DEFAULT_CELLS_SORT);
  // Restore the default ordering; surfaced by the toolbar's reset
  // affordance, which is only visible when `sortIsDefault` is false.
  function resetSort() {
    setSortKey(DEFAULT_CELLS_SORT.key);
    setSortDir(DEFAULT_CELLS_SORT.dir);
  }

  // Runs when the user confirms the modal: closes the modal immediately,
  // flags the run as busy, POSTs the replay, then reloads the batch so
  // the table reflects the server's actual state.
  async function confirmReplayRun() {
    if (!replayTarget) return;
    if (!requirePaidAction(auth, navigate, { requiredRole: 'editor' })) return;
    const runId = replayTarget.id;
    setReplayingRunIds((prev) => {
      const next = new Set(prev);
      next.add(runId);
      return next;
    });
    setReplayTarget(null);
    try {
      const result = await apiFetch(`/api/benchmark/runs/${runId}/replay`, {
        method: 'POST',
      });
      const status = result?.run?.status;
      if (status === 'queued') {
        toast.show({ tone: 'ok', title: 'Replay started', description: 'The cell is back in the queue.' });
      } else {
        // Server responded but the run did not land in `queued` — surface
        // whatever error message it attached to the run row.
        toast.show({
          tone: 'bad',
          title: 'Replay dispatch failed',
          description: result?.run?.errorMessage || 'Could not enqueue the run.',
        });
      }
      batch.reload();
    } catch (err) {
      toast.show({ tone: 'bad', title: 'Replay failed', description: err.message });
      // The reset transaction may have committed before the SQS dispatch
      // (or the server itself) failed — in that case the run is already
      // back in `running_tester` with `failed_runs` decremented. Reload
      // so the panel reflects the actual state.
      batch.reload();
    } finally {
      setReplayingRunIds((prev) => {
        if (!prev.has(runId)) return prev;
        const next = new Set(prev);
        next.delete(runId);
        return next;
      });
    }
  }

  // Auto-refresh while the batch is still active so users see runs land.
  // Polling skips ticks while the tab is hidden and stops entirely once
  // the batch reaches a terminal status.
  useEffectB(() => {
    if (!batchId) return undefined;
    const status = batch.data?.batch?.status;
    if (!status || status === 'completed' || status === 'failed' || status === 'partially_failed' || status === 'canceled') {
      return undefined;
    }
    const timer = window.setInterval(() => {
      if (document.visibilityState !== 'hidden') batch.reload();
    }, 5000);
    return () => window.clearInterval(timer);
  }, [batchId, batch.data?.batch?.status, batch.reload]);

  const runs = useMemoB(() => batch.data?.runs || [], [batch.data]);
  const sortedRuns = useMemoB(() => sortBenchmarkRuns(runs, sortKey, sortDir), [runs, sortKey, sortDir]);

  // Column-header click handler: clicking the active column flips the
  // direction; clicking a new column applies that column's natural
  // default (descending for keys in BENCHMARK_DESC_DEFAULT_KEYS).
  function setSort(key) {
    if (sortKey === key) {
      setSortDir((d) => (d === 'desc' ? 'asc' : 'desc'));
      return;
    }
    setSortKey(key);
    setSortDir(BENCHMARK_DESC_DEFAULT_KEYS.has(key) ? 'desc' : 'asc');
  }

  // Initial-load skeleton — only shown when there is no cached payload yet
  // (background reloads keep rendering the stale data below).
  if (batch.loading && !batch.data) {
    return (
      <section className="benchmark-current-batch-card">
        <div className="benchmark-current-batch-card__head">
          <span className="benchmark-current-batch-card__eyebrow">Current batch</span>
          <span className="mono benchmark-current-batch-card__id">{shortId(batchId)}</span>
        </div>
        <div className="benchmark-current-batch-card__body">
          <div className="benchmark-loader benchmark-loader--small"><LoadingSpinner /></div>
        </div>
      </section>
    );
  }

  // Fetch failure: keep the card chrome, swap the body for an error state.
  if (batch.error) {
    return (
      <section className="benchmark-current-batch-card">
        <div className="benchmark-current-batch-card__head">
          <span className="benchmark-current-batch-card__eyebrow">Current batch</span>
          <span className="mono benchmark-current-batch-card__id">{shortId(batchId)}</span>
        </div>
        <div className="benchmark-current-batch-card__body">
          <div className="card benchmark-empty-card">
            <EmptyState
              icon="alert"
              title="Couldn't load this batch"
              body={batch.error.message || 'Try reloading the page.'} />
          </div>
        </div>
      </section>
    );
  }

  const batchRow = batch.data?.batch || {};
  const comparisonRecap = batch.data?.comparisonRecap || null;
  // `undefined` keeps the default 'Replay this run' tooltip for users who
  // are allowed to replay.
  const replayDisabledReason = !canManageBenchmarks ? benchmarkActionDisabledReason : undefined;

  return (
    <section className="benchmark-current-batch-card">
      <div className="benchmark-current-batch-card__head">
        <div className="benchmark-current-batch-card__head-left">
          <span className="benchmark-current-batch-card__eyebrow">Current batch</span>
          <span className="mono benchmark-current-batch-card__id" title={batchId}>
            {shortId(batchId)}
          </span>
          <span className="benchmark-current-batch-card__sub text-xs muted">
            started {formatDateTime(batchRow.started_at || batchRow.created_at)}
            {batchRow.triggered_by_email && ` · by ${batchRow.triggered_by_display_name || batchRow.triggered_by_email}`}
          </span>
        </div>
      </div>

      <div className="benchmark-current-batch-card__body">
        <div className="benchmark-summary-grid">
          <BenchmarkStat label="Cells" value={String(batchRow.total_runs || 0)} />
          <BenchmarkStat label="Completed" value={String(batchRow.completed_runs || 0)} />
          <BenchmarkStat label="Failed" value={String(batchRow.failed_runs || 0)} />
          <BenchmarkStat
            label="Status"
            value={<BenchmarkBatchStatusPill status={batchRow.status} />} />
        </div>

        <BenchmarkRecapCard recap={comparisonRecap} />

        <div className="ui-section-head benchmark-section-head--sub">
          <div className="ui-section-title">Cells</div>
          <span
            className="text-xs muted"
            title="Default: best score first, fastest first. Click any column to re-sort.">
            {runs.length} run{runs.length === 1 ? '' : 's'}
          </span>
        </div>

        {runs.length === 0 ? (
          <div className="card benchmark-empty-card">
            <EmptyState
              icon="activity"
              title="No runs yet"
              body="Cells are queued. They'll appear here as they finish." />
          </div>
        ) : (
          <div className="card benchmark-table-wrap">
            <BenchmarkTableToolbar visible={!sortIsDefault} onReset={resetSort} />
            <table className="ui-table benchmark-cells-table">
              <thead>
                <tr>
                  <th className="benchmark-table-row-num">#</th>
                  <BenchmarkSortHeader label="Source" sortKey="source" current={sortKey} dir={sortDir} onSort={setSort} />
                  <BenchmarkSortHeader label="Harness" sortKey="model" current={sortKey} dir={sortDir} onSort={setSort} />
                  <BenchmarkSortHeader label="Score" sortKey="score" current={sortKey} dir={sortDir} onSort={setSort} />
                  <BenchmarkSortHeader label="Duration" sortKey="duration" current={sortKey} dir={sortDir} onSort={setSort} />
                  <BenchmarkSortHeader label="Tools" sortKey="tools" current={sortKey} dir={sortDir} onSort={setSort} />
                  <BenchmarkSortHeader label="In tokens" sortKey="input_tokens" current={sortKey} dir={sortDir} onSort={setSort} />
                  <BenchmarkSortHeader label="Out tokens" sortKey="output_tokens" current={sortKey} dir={sortDir} onSort={setSort} />
                  <BenchmarkSortHeader label="Status" sortKey="status" current={sortKey} dir={sortDir} onSort={setSort} />
                </tr>
              </thead>
              <tbody>
                {sortedRuns.map((run, idx) => {
                  const isReplayable = REPLAYABLE_RUN_STATUSES_UI.has(run.status);
                  const isReplaying = replayingRunIds.has(run.id);
                  return (
                    <tr
                      key={run.id}
                      className="ui-table-row-clickable"
                      onClick={() => navigate(`/runs/${run.id}`)}>
                      <td>{idx + 1}</td>
                      <td>{run.mcp_server_name || '—'}</td>
                      <td>
                        <BenchmarkHarnessCell
                          harnessKey={run.tester_harness}
                          sdkKey={run.tester_sdk_key}
                          sdkName={run.tester_sdk_display_name}
                          modelName={run.tester_model_name} />
                      </td>
                      <td><BenchmarkScoreChip score={run.evaluation_score} /></td>
                      <td>{formatDuration(run.duration_ms) || '—'}</td>
                      <td>{run.tool_call_count || 0}</td>
                      <td>{formatNumber(run.input_tokens) || '0'}</td>
                      <td>{formatNumber(run.output_tokens) || '0'}</td>
                      <td>
                        <div className="benchmark-status-cell">
                          <BenchmarkRunStatusPill run={run} />
                          {isReplayable && (
                            <button
                              type="button"
                              className="btn btn-sm benchmark-replay-btn"
                              disabled={!canManageBenchmarks || isReplaying}
                              title={replayDisabledReason || 'Replay this run'}
                              data-row-action
                              onClick={(e) => {
                                {/* Row onClick navigates; stop the bubble so
                                    the replay click stays on this page. */}
                                e.stopPropagation();
                                if (!canManageBenchmarks) return;
                                setReplayTarget(run);
                              }}>
                              <Icon name="refresh" size={12} />
                              {isReplaying ? 'Replaying…' : 'Replay'}
                            </button>
                          )}
                        </div>
                      </td>
                    </tr>
                  );
                })}
              </tbody>
            </table>
          </div>
        )}
      </div>
      <ConfirmDialog
        open={Boolean(replayTarget)}
        tone="default"
        title="Replay this failed run?"
        description={
          replayTarget
            ? `This will re-run ${replayTarget.mcp_server_name || 'this source'} with `
              + `${replayTarget.tester_model_name || replayTarget.tester_harness || 'the same tester'}. `
              + 'The prior attempt is discarded. Heads up: replaying individual failures can bias the batch’s '
              + 'success rate — successes don’t get a second chance. Prefer a fresh batch for clean comparisons.'
            : ''
        }
        confirmLabel="Replay run"
        confirmBusyLabel="Replaying..."
        // Modal closes synchronously on confirm; the per-row button
        // shows the in-flight spinner. Keeping busy=false here means a
        // user can immediately queue up another row's replay without
        // waiting for the first request to return.
        busy={false}
        onCancel={() => setReplayTarget(null)}
        onConfirm={confirmReplayRun} />
    </section>
  );
}

// ---------------------------------------------------------------------------
// "New benchmark" modal.
//   Authoring — same assisted/manual flow used by "New workflow", with
//               benchmark-specific Next actions instead of create actions.
//   Matrix    — selected sources + tester targets/harnesses/models multi-add.
// On submit the parent creates the workflow AND (unless the user picks
// "Create only") dispatches its first batch in the same flow.
// ---------------------------------------------------------------------------

// True when `row` is an MCP source reachable over the network — i.e. its
// kind is 'mcp' (a missing/legacy `target_kind` is treated as 'mcp') and
// its transport is not hosted stdio. Used by the create dialog to pick the
// best source to seed the assisted draft flow with.
//
// A nullish row now returns false: previously `(row?.target_kind || 'mcp')`
// defaulted to 'mcp' and `row?.transport_type` (undefined) compared unequal
// to 'stdio_hosted', so `isHttpMcpSource(null)` incorrectly answered true.
function isHttpMcpSource(row) {
  if (!row) return false;
  return (row.target_kind || 'mcp') === 'mcp' && row.transport_type !== 'stdio_hosted';
}

/**
 * Unified "New benchmark" modal.
 *
 * Stage machine (see `stage`; `returnStage` records where the matrix
 * stage's Back button should land):
 *   sources → choose → agentStart → (usecase →) review → matrix
 *                    ↘ manual ─────────────────────────↗
 * The 'existing' path skips authoring entirely: after loading the chosen
 * workflow's prompt/criteria it jumps straight from 'choose' to 'matrix'.
 *
 * @param {object} props
 * @param {Function} props.onClose - Close the modal (blocked while saving).
 * @param {boolean} props.saving - Parent-owned flag: the create/run request
 *   is in flight.
 * @param {Array<object>} props.sources - Candidate benchmark source rows.
 * @param {Array<object>} props.models - Model rows for target defaults and
 *   the fixed evaluator pick.
 * @param {Function} props.onSubmit - (spec, options) => void. Parent creates
 *   the workflow and, when options.runFirstBatch is true, dispatches its
 *   first batch in the same flow.
 */
function BenchmarkWorkflowCreateDialog({ onClose, saving, sources, models, onSubmit }) {
  const auth = useAuth();
  const toast = useToast();
  const productionWorkflows = useApiResource('/api/workflows');
  const [stage, setStage] = useStateB('sources'); // 'sources' | 'choose' | 'agentStart' | 'usecase' | 'review' | 'manual' | 'matrix'
  const [animDir, setAnimDir] = useStateB('forward');
  const [path, setPath] = useStateB('existing'); // 'existing' | 'auto' | 'manual'
  const [existingWorkflowId, setExistingWorkflowId] = useStateB('');
  const [agentIdeaMode, setAgentIdeaMode] = useStateB('direct');
  // Local busy flag for loading an existing workflow (distinct from the
  // parent's `saving`, which covers the final create/run request).
  const [working, setWorking] = useStateB(false);
  // Where the matrix stage's Back button returns to:
  // 'choose' | 'review' | 'manual'.
  const [returnStage, setReturnStage] = useStateB('choose');
  // Workflow definition carried into the matrix stage, filled by whichever
  // authoring path the user completed.
  const [basics, setBasics] = useStateB({
    name: '',
    description: '',
    testerPrompt: '',
    criteria: [],
  });
  // Inactive sources are hidden from the picker entirely.
  const usableSources = useMemoB(() => sources.filter((row) => row.is_active !== false), [sources]);
  const [selectedSourceIds, setSelectedSourceIds] = useStateB([]);
  const [testerTargets, setTesterTargets] = useStateB([]);
  const [originFilter, setOriginFilter] = useStateB('all');
  const [search, setSearch] = useStateB('');
  const evaluatorModelId = useMemoB(() => pickFixedEvaluatorModelId(models), [models]);
  const selectedSources = useMemoB(
    () => usableSources.filter((row) => selectedSourceIds.includes(row.id)),
    [usableSources, selectedSourceIds],
  );
  // Single source used to seed the assisted-draft context. Preference
  // order: production HTTP MCP → any HTTP MCP → production MCP → any MCP →
  // whatever is selected first.
  const selectedDraftSource = useMemoB(() => (
    selectedSources.find((row) => row._origin === 'production' && isHttpMcpSource(row))
    || selectedSources.find((row) => isHttpMcpSource(row))
    || selectedSources.find((row) => row._origin === 'production' && (row.target_kind || 'mcp') === 'mcp')
    || selectedSources.find((row) => (row.target_kind || 'mcp') === 'mcp')
    || selectedSources[0]
    || null
  ), [selectedSources]);
  // Shared assisted-draft state machine (same hook the production
  // "New workflow" flow uses), pointed at the benchmark agent endpoints.
  const flow = useWorkflowDraftFlow({
    mcpServerId: selectedDraftSource?.id || '',
    mcpServerIds: selectedSourceIds,
    modelRows: models,
    orgDefaultTargets: auth?.me?.organization?.workflowDefaultTargets,
    selectedServer: selectedDraftSource,
    toast,
    onCreated: () => undefined,
    agentSuggestEndpoint: '/api/benchmark/workflows/agent-suggest-usecases',
    agentDraftEndpoint: '/api/benchmark/workflows/agent-draft',
  });
  const {
    usecase, setUsecase,
    suggestState,
    draftState,
    draft, setDraft,
    manualDraft, setManualDraft,
    ensureManualDraft,
    resetAgentDraft,
    prepareDirectUsecaseDraft,
    prepareSuggestedUsecases,
    restoreDirectUsecaseDraft,
    suggestUsecases,
    resuggestUsecases,
    loadAgentDraft,
  } = flow;
  const workflowOptions = (productionWorkflows.data?.rows || []).map((workflow) => ({
    value: workflow.id,
    label: workflow.name || 'Untitled workflow',
  }));

  // Single stage-transition helper; `dir` drives the enter animation class.
  function goTo(nextStage, dir = 'forward') {
    setAnimDir(dir);
    setStage(nextStage);
  }

  // Make sure a manual draft object exists when entering the manual stage.
  useEffectB(() => {
    if (stage !== 'manual') return;
    ensureManualDraft();
  }, [stage]); // eslint-disable-line

  // Kick off agent drafting on entering review, unless a draft is already
  // loading or loaded.
  useEffectB(() => {
    if (stage !== 'review') return;
    if (draftState.status === 'loading' || draftState.status === 'ready') return;
    loadAgentDraft();
  }, [stage]); // eslint-disable-line

  // Seed the tester-target defaults once models arrive, but only if the
  // user hasn't picked any targets yet.
  useEffectB(() => {
    if (testerTargets.length === 0 && models.length > 0) {
      const defaults = pickDefaultBenchmarkTesterTargets(models);
      if (defaults.length > 0) setTesterTargets(defaults);
    }
  }, [models]); // eslint-disable-line

  // Matrix-stage gate: need ≥1 source, ≥1 tester target, and a tester prompt.
  const step2Invalid = selectedSourceIds.length === 0 || testerTargets.length === 0 || !basics.testerPrompt.trim();
  const sourceStepInvalid = selectedSourceIds.length === 0;
  const totalCells = selectedSourceIds.length * testerTargets.length;

  // One criterion per non-empty line of the manual textarea.
  function criteriaFromText(text) {
    return (text || '').split('\n').map((line) => line.trim()).filter(Boolean);
  }

  // Extract criterion strings from API rows (snake_case or camelCase).
  function criteriaFromRows(rows) {
    return (rows || [])
      .map((criterion) => (criterion?.criterion_text || criterion?.criterionText || '').trim())
      .filter(Boolean);
  }

  // 'existing' path: load the chosen production workflow, copy its
  // prompt/criteria into `basics`, and jump to the matrix stage. Rejects
  // workflows without a tester prompt.
  async function continueFromExisting() {
    if (!existingWorkflowId || working) return;
    setWorking(true);
    try {
      const detail = await apiFetch(`/api/workflows/${existingWorkflowId}`);
      const workflow = detail.workflow || {};
      const version = detail.currentVersion || {};
      const testerPrompt = (version.tester_prompt || '').trim();
      if (!testerPrompt) {
        toast.show({
          tone: 'bad',
          title: 'Workflow is missing a tester prompt',
          description: 'Choose a workflow with a tester prompt or define this benchmark manually.',
        });
        return;
      }
      setBasics({
        name: workflow.name || 'Benchmark workflow',
        description: workflow.description || '',
        testerPrompt,
        criteria: criteriaFromRows(detail.criteria),
      });
      setReturnStage('choose');
      goTo('matrix', 'forward');
    } catch (err) {
      toast.show({ tone: 'bad', title: 'Could not load workflow', description: err.message });
    } finally {
      setWorking(false);
    }
  }

  // Assisted path: adopt the agent draft as `basics` and advance.
  function continueFromAgent() {
    if (!draft?.name?.trim() || !draft?.testerPrompt?.trim()) return;
    setBasics({
      name: draft.name.trim(),
      description: (draft.description || '').trim(),
      testerPrompt: draft.testerPrompt.trim(),
      criteria: (draft.criteria || [])
        .map((criterion) => (criterion?.criterionText || '').trim())
        .filter(Boolean),
    });
    setReturnStage('review');
    goTo('matrix', 'forward');
  }

  // Manual path: adopt the hand-filled draft as `basics` and advance.
  function continueFromManual() {
    if (!manualDraft?.name?.trim() || !manualDraft?.testerPrompt?.trim()) return;
    setBasics({
      name: manualDraft.name.trim(),
      description: (manualDraft.description || '').trim(),
      testerPrompt: manualDraft.testerPrompt.trim(),
      criteria: criteriaFromText(manualDraft.criteriaText),
    });
    setReturnStage('manual');
    goTo('matrix', 'forward');
  }

  // Assemble the create-workflow payload. Benchmark workflows carry the
  // full source list in `mcpServerIds` and have no single primary server
  // (`mcpServerId` stays null).
  function buildSpec({ runFirstBatch }) {
    return {
      spec: {
        name: basics.name.trim(),
        description: basics.description.trim() || null,
        mcpServerId: null,
        defaultMcpAuthProfileId: null,
        testerPrompt: basics.testerPrompt.trim(),
        testerTargets,
        testerModelIds: testerTargets.map((t) => t.modelId),
        evaluatorModelId: evaluatorModelId || undefined,
        criteria: basics.criteria,
        schedule: null,
        isActive: true,
        isBenchmark: true,
        mcpServerIds: selectedSourceIds,
      },
      options: { runFirstBatch },
    };
  }

  // "Create only": save the workflow without dispatching a batch.
  function submitCreateOnly() {
    if (step2Invalid || saving) return;
    const { spec, options } = buildSpec({ runFirstBatch: false });
    onSubmit(spec, options);
  }

  // "Create & run": save the workflow and queue the first batch.
  function submitCreateAndRun(event) {
    event?.preventDefault?.();
    if (step2Invalid || saving) return;
    const { spec, options } = buildSpec({ runFirstBatch: true });
    onSubmit(spec, options);
  }

  useEscapeToClose({
    disabled: saving,
    onClose,
  });

  // Header copy tracks the stage; only the bookend stages get bespoke text.
  const headerTitle = stage === 'sources'
    ? 'Pick sources'
    : stage === 'matrix'
      ? 'Pick harnesses and models'
      : 'New benchmark';
  const headerSubtitle = stage === 'sources'
    ? 'Choose the MCP servers, benchmark targets, or CLIs before defining the workflow.'
    : stage === 'matrix'
      ? `Each (source × target) combination runs as one cell.${totalCells > 0 ? ` ${totalCells} cell${totalCells === 1 ? '' : 's'} will be queued.` : ''}`
      : 'Choose or create the workflow definition using the selected source context.';

  // Table-heavy stages get the wider modal shell.
  const isWide = stage === 'sources' || stage === 'manual' || stage === 'matrix';

  return (
    <div className="dialog-backdrop" onMouseDown={(e) => {
      if (e.target === e.currentTarget && !saving) onClose();
    }}>
      <div
        className={`modal-panel workflow-onb-modal benchmark-create-modal${isWide ? ' is-wide' : ''}`}
        role="dialog"
        aria-modal="true">
        <div className="modal-header">
          <div>
            <div className="modal-title">{headerTitle}</div>
            <div className="modal-subtitle">{headerSubtitle}</div>
          </div>
          <button className="icon-btn" type="button" aria-label="Close" disabled={saving} onClick={onClose}>
            <Icon name="x" size={15} />
          </button>
        </div>

        <div className="workflow-onb-stage-wrap">
          {stage === 'sources' && (
            <div className={`workflow-onb-stage benchmark-create-stage ${animDir === 'back' ? 'is-entering-back' : 'is-entering'}`}>
              {/* Stage 1 reuses the shared matrix picker with targets hidden. */}
              <BenchmarkMatrixPicker
                sources={usableSources}
                models={models}
                selectedSourceIds={selectedSourceIds}
                setSelectedSourceIds={setSelectedSourceIds}
                testerTargets={testerTargets}
                setTesterTargets={setTesterTargets}
                originFilter={originFilter}
                setOriginFilter={setOriginFilter}
                search={search}
                setSearch={setSearch}
                totalCells={totalCells}
                saving={saving}
                showTargets={false} />
              <div className="workflow-onb-footer" style={{ borderTop: 'none', padding: 0, marginTop: 4 }}>
                <div className="workflow-onb-footer-left" />
                <div className="workflow-onb-footer-right">
                  <Button disabled={saving} onClick={onClose}>Cancel</Button>
                  <Button
                    variant="primary"
                    disabled={sourceStepInvalid || saving}
                    onClick={() => goTo('choose', 'forward')}>
                    Next<Icon name="chevronRight" size={12} />
                  </Button>
                </div>
              </div>
            </div>
          )}
          {stage === 'choose' && (
            <BenchmarkWorkflowChoiceStage
              animDir={animDir}
              path={path}
              onPickPath={setPath}
              workflowOptions={workflowOptions}
              workflowLoading={productionWorkflows.loading && !productionWorkflows.data}
              workflowError={productionWorkflows.error}
              onRetryWorkflows={productionWorkflows.reload}
              existingWorkflowId={existingWorkflowId}
              setExistingWorkflowId={setExistingWorkflowId}
              working={working}
              onBack={() => goTo('sources', 'back')}
              onContinue={() => {
                if (path === 'existing') continueFromExisting();
                else goTo(path === 'manual' ? 'manual' : 'agentStart', 'forward');
              }} />
          )}
          {stage === 'agentStart' && (
            <WorkflowOnboardingAgentStart
              animDir={animDir}
              usecase={usecase}
              setUsecase={setUsecase}
              onBack={() => goTo('choose', 'back')}
              onGenerateIdeas={() => {
                prepareSuggestedUsecases();
                setAgentIdeaMode('suggested');
                goTo('usecase', 'forward');
              }}
              onContinue={() => {
                prepareDirectUsecaseDraft();
                setAgentIdeaMode('direct');
                goTo('review', 'forward');
              }}
              continueLabel="Generate benchmark" />
          )}
          {stage === 'usecase' && (
            <WorkflowOnboardingUseCase
              animDir={animDir}
              server={selectedDraftSource}
              suggestState={suggestState}
              usecase={usecase}
              setUsecase={setUsecase}
              onSuggest={suggestUsecases}
              onResuggest={resuggestUsecases}
              onBack={() => { restoreDirectUsecaseDraft(); goTo('agentStart', 'back'); }}
              onSwitchManual={() => goTo('manual', 'forward')}
              onContinue={() => {
                resetAgentDraft();
                setAgentIdeaMode('suggested');
                goTo('review', 'forward');
              }} />
          )}
          {stage === 'review' && (
            <WorkflowOnboardingReview
              animDir={animDir}
              draftState={draftState}
              draft={draft}
              setDraft={setDraft}
              testerModelIds={[]}
              testerTargets={[]}
              onChangeTesterModels={() => undefined}
              onChangeTesterTargets={() => undefined}
              models={models}
              saving={saving}
              onBack={() => goTo(agentIdeaMode === 'suggested' ? 'usecase' : 'agentStart', 'back')}
              onRetry={loadAgentDraft}
              onSwitchManual={() => goTo('manual', 'forward')}
              onCreate={continueFromAgent}
              hideTargetsAndFrequency
              primaryActionLabel="Next"
              primaryActionIcon="chevronRight"
              reviewActionText="continuing" />
          )}
          {stage === 'manual' && manualDraft && (
            <WorkflowOnboardingManual
              animDir={animDir}
              server={null}
              draft={manualDraft}
              setDraft={setManualDraft}
              testerModelIds={[]}
              testerTargets={[]}
              onChangeTesterModels={() => undefined}
              onChangeTesterTargets={() => undefined}
              models={models}
              saving={saving}
              onBack={() => goTo('choose', 'back')}
              onCreate={continueFromManual}
              hideServerHeader
              hideAuthProfile
              hideTargets
              hideFrequency
              primaryActionLabel="Next"
              primaryActionIcon="chevronRight" />
          )}
          {stage === 'matrix' && (
            <div className={`workflow-onb-stage benchmark-create-stage ${animDir === 'back' ? 'is-entering-back' : 'is-entering'}`}>
              {/* Final stage reuses the same picker with sources hidden. */}
              <BenchmarkMatrixPicker
                sources={usableSources}
                models={models}
                selectedSourceIds={selectedSourceIds}
                setSelectedSourceIds={setSelectedSourceIds}
                testerTargets={testerTargets}
                setTesterTargets={setTesterTargets}
                originFilter={originFilter}
                setOriginFilter={setOriginFilter}
                search={search}
                setSearch={setSearch}
                totalCells={totalCells}
                saving={saving}
                showSources={false} />
              <div className="workflow-onb-footer" style={{ borderTop: 'none', padding: 0, marginTop: 4 }}>
                <div className="workflow-onb-footer-left">
                  <Button disabled={saving} onClick={() => goTo(returnStage, 'back')}>
                    <Icon name="chevronLeft" size={12} />Back
                  </Button>
                </div>
                <div className="workflow-onb-footer-right">
                  <Button
                    disabled={saving || step2Invalid}
                    onClick={submitCreateOnly}>
                    Create only
                  </Button>
                  <Button
                    variant="primary"
                    disabled={step2Invalid}
                    loading={saving}
                    loadingLabel="Starting..."
                    onClick={submitCreateAndRun}>
                    <Icon name="play" size={12} />
                    Create &amp; run {totalCells > 0 ? `${totalCells} cell${totalCells === 1 ? '' : 's'}` : 'batch'}
                  </Button>
                </div>
              </div>
            </div>
          )}
        </div>
      </div>
    </div>
  );
}

/**
 * Stage 2 of the create modal: pick how the workflow definition is sourced
 * — an existing production workflow (via the Select), Armature-assisted
 * drafting, or manual entry. Purely presentational; all state lives in the
 * parent dialog and flows in through props.
 *
 * @param {object} props
 * @param {string} props.animDir - 'forward' | 'back'; picks the enter animation.
 * @param {string} props.path - Selected path: 'existing' | 'auto' | 'manual'.
 * @param {Function} props.onPickPath - Select a path.
 * @param {Array<{value: string, label: string}>} props.workflowOptions - Select options.
 * @param {boolean} props.workflowLoading - Production-workflow fetch in flight.
 * @param {?Error} props.workflowError - Production-workflow fetch failure, if any.
 * @param {Function} props.onRetryWorkflows - Re-fetch production workflows.
 * @param {string} props.existingWorkflowId - Currently chosen workflow id ('' if none).
 * @param {Function} props.setExistingWorkflowId - Update the chosen workflow id.
 * @param {boolean} props.working - Parent is loading the chosen workflow's detail.
 * @param {Function} props.onBack - Return to the sources stage.
 * @param {Function} props.onContinue - Advance along the selected path.
 */
function BenchmarkWorkflowChoiceStage({
  animDir,
  path,
  onPickPath,
  workflowOptions,
  workflowLoading,
  workflowError,
  onRetryWorkflows,
  existingWorkflowId,
  setExistingWorkflowId,
  working,
  onBack,
  onContinue,
}) {
  // The 'existing' path additionally requires a workflow to be chosen;
  // the other paths can always continue.
  const canContinue = path !== 'existing' || Boolean(existingWorkflowId);
  // Only show the loader before the first page of options has arrived.
  const showWorkflowLoader = workflowLoading && workflowOptions.length === 0;
  return (
    <div className={`workflow-onb-stage ${animDir === 'back' ? 'is-entering-back' : 'is-entering'}`}>
      <div className="workflow-onb-field">
        <div className="workflow-onb-field-label">Choose or define a workflow</div>
        <div className="workflow-onb-field">
          {showWorkflowLoader && (
            <div className="benchmark-workflow-loading" role="status">
              <LoadingSpinner size="sm" decorative />
              <span>Loading workflows...</span>
            </div>
          )}
          {/* Picking a workflow implicitly selects the 'existing' path. */}
          <Select
            className={`benchmark-workflow-select ${path === 'existing' ? 'is-selected' : ''}`}
            value={existingWorkflowId}
            onChange={(value) => {
              setExistingWorkflowId(value);
              onPickPath('existing');
            }}
            placeholder="Use an existing workflow"
            disabled={workflowLoading || working}
            options={workflowOptions} />
          {workflowError && !workflowLoading && (
            <div className="benchmark-workflow-load-error">
              <div>
                <div className="text-xs font-semibold">Could not load workflows</div>
                <div className="text-xs muted">{workflowError.message || 'Try again or use Armature/manual creation.'}</div>
              </div>
              <Button size="sm" disabled={working} onClick={onRetryWorkflows}>Retry</Button>
            </div>
          )}
          {workflowOptions.length === 0 && !workflowLoading && !workflowError && (
            <div className="text-xs muted">No production workflows are available yet.</div>
          )}
        </div>
        <div className="workflow-onb-path-cards">
          <button
            type="button"
            className={`workflow-onb-path-card ${path === 'auto' ? 'is-selected' : ''}`}
            onClick={() => onPickPath('auto')}>
            <div className="workflow-onb-path-card-mark"><Icon name="sparkle" size={14} /></div>
            <div style={{ flex: 1 }}>
              <div className="workflow-onb-path-card-title">
                Armature agent assisted workflow creation
              </div>
              <div className="workflow-onb-path-card-sub">
                Describe the benchmark task and let Armature draft it.
              </div>
            </div>
          </button>
          <button
            type="button"
            className={`workflow-onb-path-card ${path === 'manual' ? 'is-selected' : ''}`}
            onClick={() => onPickPath('manual')}>
            <div className="workflow-onb-path-card-mark"><Icon name="edit" size={13} /></div>
            <div style={{ flex: 1 }}>
              <div className="workflow-onb-path-card-title">Manual</div>
              <div className="workflow-onb-path-card-sub">
                Fill the workflow fields yourself.
              </div>
            </div>
          </button>
        </div>
      </div>

      <div className="workflow-onb-footer" style={{ borderTop: 'none', padding: 0, marginTop: 4 }}>
        <div className="workflow-onb-footer-left">
          <Button disabled={working} onClick={onBack}><Icon name="chevronLeft" size={12} />Back</Button>
        </div>
        <div className="workflow-onb-footer-right">
          <Button
            variant="primary"
            disabled={!canContinue || working}
            loading={working}
            loadingLabel="Loading"
            onClick={onContinue}>
            Next<Icon name="chevronRight" size={12} />
          </Button>
        </div>
      </div>
    </div>
  );
}

// ---------------------------------------------------------------------------
// Shared matrix picker + Run-batch dialog
// ---------------------------------------------------------------------------
//
// The "Run benchmark" dialog and the second step of the unified create
// modal render identical UI: a sources multi-select plus the tester targets
// multi-add. We factor that body into BenchmarkMatrixPicker and keep the
// dialog shell minimal.

// Shared matrix body for the Run-benchmark dialog and the create modal's
// second step: a filterable sources multi-select plus the tester-targets
// multi-add. Fully controlled — all selection/filter state lives in the
// parent and flows back through the setSelectedSourceIds / setTesterTargets /
// setOriginFilter / setSearch callbacks. `showSources` / `showTargets` let a
// caller render only one half of the matrix; `totalCells` is the
// (sources × targets) product the parent computed for the header copy.
function BenchmarkMatrixPicker({
  sources,
  models,
  selectedSourceIds,
  setSelectedSourceIds,
  testerTargets,
  setTesterTargets,
  originFilter,
  setOriginFilter,
  search,
  setSearch,
  totalCells,
  saving = false,
  showSources = true,
  showTargets = true,
}) {
  // Apply the active origin/kind chip plus the case-insensitive name search.
  const filteredSources = useMemoB(() => {
    const q = (search || '').trim().toLowerCase();
    return sources.filter((row) => {
      if (originFilter === 'benchmark' && row._origin !== 'benchmark') return false;
      if (originFilter === 'internal' && row._origin !== 'production') return false;
      // Rows with no target_kind default to MCP for the kind chips.
      if (originFilter === 'mcp' && (row.target_kind || 'mcp') !== 'mcp') return false;
      if (originFilter === 'cli' && row.target_kind !== 'cli') return false;
      if (q && !(row.name || '').toLowerCase().includes(q)) return false;
      return true;
    });
  }, [sources, originFilter, search]);

  // Add or remove a single source id from the selection.
  function toggleSource(id) {
    setSelectedSourceIds((current) => current.includes(id)
      ? current.filter((x) => x !== id)
      : [...current, id]);
  }

  // Union the currently-filtered rows into the selection; rows already
  // selected but hidden by the filter are kept.
  function selectAllFiltered() {
    const ids = filteredSources.map((row) => row.id);
    setSelectedSourceIds((current) => Array.from(new Set([...current, ...ids])));
  }

  // Clear the whole selection, including rows hidden by the filter.
  function clearSelection() {
    setSelectedSourceIds([]);
  }

  // Filter chip list. The two "kind" chips (MCPs only / CLIs only) sit
  // alongside the origin chips; the picker is single-select so only one
  // chip is ever active at a time.
  const filterChips = [
    { key: 'all', label: 'All' },
    { key: 'internal', label: 'Internal' },
    { key: 'benchmark', label: 'Benchmark' },
    { key: 'mcp', label: 'MCPs only' },
    { key: 'cli', label: 'CLIs only' },
  ];

  return (
    <div className="mcp-form">
      {showSources && (
      <section className="mcp-form-section">
        <div className="ui-section-head benchmark-matrix-head">
          <h4 className="mcp-form-section-title benchmark-matrix-head__title">
            Sources ({selectedSourceIds.length} selected{showTargets ? ` · ${totalCells} cell${totalCells === 1 ? '' : 's'}` : ''})
          </h4>
          <span className="text-xs benchmark-matrix-head__actions">
            <button type="button" className="link-btn" onClick={selectAllFiltered} disabled={saving}>
              Select all{filteredSources.length !== sources.length ? ' filtered' : ''}
            </button>
            <span className="muted"> · </span>
            <button type="button" className="link-btn" onClick={clearSelection} disabled={saving}>Clear</button>
          </span>
        </div>
        <p className="mcp-form-section-desc">
          Pick which MCP servers and CLI targets to compare. Internal sources run your production target;
          benchmark sources are competitor targets you've added for comparison.
        </p>

        <div className="benchmark-source-filters">
          {filterChips.map((opt) => (
            <button
              key={opt.key}
              type="button"
              onClick={() => setOriginFilter(opt.key)}
              disabled={saving}
              className={`benchmark-source-filter${originFilter === opt.key ? ' is-active' : ''}`}>
              {opt.label}
            </button>
          ))}
        </div>
        <input
          className="input benchmark-source-search"
          type="search"
          placeholder="Search sources"
          value={search}
          disabled={saving}
          onChange={(e) => setSearch(e.target.value)} />

        <div className="benchmark-pick-list">
          {filteredSources.length === 0 ? (
            <div className="benchmark-pick-empty">No sources match the current filter.</div>
          ) : filteredSources.map((row) => {
            const checked = selectedSourceIds.includes(row.id);
            const targetKind = row.target_kind || 'mcp';
            const isBenchmark = row._origin === 'benchmark';
            return (
              <label
                key={row.id}
                className={`benchmark-pick-row${checked ? ' is-checked' : ''}`}>
                <input
                  type="checkbox"
                  checked={checked}
                  disabled={saving}
                  onChange={() => toggleSource(row.id)} />
                <Icon
                  name={targetKind === 'cli' ? 'terminal' : 'mcp'}
                  size={14}
                  className="benchmark-pick-row__icon" />
                <div className="benchmark-pick-row__body">
                  <div className="benchmark-pick-row__name">{row.name}</div>
                  <div className="benchmark-pick-row__meta">
                    {targetKind.toUpperCase()}
                    {row.environment ? ` · ${row.environment}` : ''}
                  </div>
                </div>
                <Pill tone={isBenchmark ? 'warn' : 'info'}>
                  {isBenchmark ? 'Benchmark' : 'Internal'}
                </Pill>
              </label>
            );
          })}
        </div>
      </section>
      )}

      {showTargets && (
      <section className="mcp-form-section">
        <h4 className="mcp-form-section-title">
          Tester targets ({testerTargets.length} selected)
        </h4>
        <p className="mcp-form-section-desc">
          Each (source × tester) pair runs as one cell. Defaults to one target per active provider.
        </p>
        {/*
         * We reuse WorkflowTargetPicker (the production workflow editor's
         * harness/model picker) so harness rules stay consistent. The
         * benchmark batch payload uses {harness, modelId} per row, which
         * is exactly the shape WorkflowTargetPicker emits via
         * `onChangeTargets`.
         */}
        <WorkflowTargetPicker
          models={models}
          targetRows={testerTargets}
          modelIds={testerTargets.map((t) => t.modelId)}
          disabled={saving}
          onChange={() => {
            // No-op: WorkflowTargetPicker emits both `onChange(modelIds)` and
            // `onChangeTargets(rows)`. We only need the row shape because
            // benchmark batches dispatch on (harness, modelId) tuples.
          }}
          onChangeTargets={(rows) => setTesterTargets(rows.map((row) => ({
            harness: row.harness,
            modelId: row.modelId,
          })))} />
      </section>
      )}
    </div>
  );
}

// Modal shell around BenchmarkMatrixPicker. Collects the (sources × tester
// targets) matrix, shows the live cell count in the subtitle and submit
// button, and calls `onSubmit({ mcpServerIds, testerTargets })`. `defaults`
// (when present) comes from the most recent batch's config so reruns start
// from the previous selection.
function BenchmarkRunDialog({ onClose, saving, sources, models, onSubmit, defaults }) {
  // Inactive sources can't be run; exclude them from every list below.
  const usableSources = useMemoB(() => sources.filter((row) => row.is_active !== false), [sources]);
  // Pre-fill from the most recent batch's config when available; otherwise
  // fall back to "all sources selected" + the default tester target set.
  const [selectedSourceIds, setSelectedSourceIds] = useStateB(() => {
    if (defaults?.mcpServerIds && defaults.mcpServerIds.length > 0) {
      // Drop ids that no longer resolve to a usable source.
      return defaults.mcpServerIds.filter(
        (id) => usableSources.some((row) => row.id === id),
      );
    }
    return usableSources.map((row) => row.id);
  });
  const [testerTargets, setTesterTargets] = useStateB(() => {
    if (defaults?.testerTargets && defaults.testerTargets.length > 0) {
      return defaults.testerTargets;
    }
    return [];
  });
  const [originFilter, setOriginFilter] = useStateB('all');
  const [search, setSearch] = useStateB('');

  // Models can arrive after mount; backfill the default tester targets once
  // they do (only if the user hasn't already got a selection).
  useEffectB(() => {
    if (testerTargets.length === 0 && models.length > 0) {
      const fallback = pickDefaultBenchmarkTesterTargets(models);
      if (fallback.length > 0) setTesterTargets(fallback);
    }
  }, [models]); // eslint-disable-line

  const totalCells = selectedSourceIds.length * testerTargets.length;
  const formInvalid = selectedSourceIds.length === 0 || testerTargets.length === 0;

  // Queue the batch. Guarded against double-submit while `saving`.
  function submit(event) {
    event.preventDefault();
    if (formInvalid || saving) return;
    onSubmit({
      mcpServerIds: selectedSourceIds,
      testerTargets,
    });
  }

  useEscapeToClose({
    disabled: saving,
    onClose,
  });

  return (
    <div className="dialog-backdrop" onMouseDown={(e) => {
      if (e.target === e.currentTarget && !saving) onClose();
    }}>
      <form
        className="modal-panel mcp-connect-modal benchmark-create-modal"
        role="dialog"
        aria-modal="true"
        onSubmit={submit}>
        <div className="modal-header">
          <div>
            <div className="modal-title">Run benchmark</div>
            <div className="modal-subtitle">
              Each (source × model) combination runs as one cell.
              {totalCells > 0 && ` ${totalCells} cell${totalCells === 1 ? '' : 's'} will be queued.`}
            </div>
          </div>
          <button className="icon-btn" type="button" aria-label="Close" disabled={saving} onClick={onClose}>
            <Icon name="x" size={15} />
          </button>
        </div>
        <div className="modal-body">
          <BenchmarkMatrixPicker
            sources={usableSources}
            models={models}
            selectedSourceIds={selectedSourceIds}
            setSelectedSourceIds={setSelectedSourceIds}
            testerTargets={testerTargets}
            setTesterTargets={setTesterTargets}
            originFilter={originFilter}
            setOriginFilter={setOriginFilter}
            search={search}
            setSearch={setSearch}
            totalCells={totalCells}
            saving={saving} />
        </div>
        <div className="modal-actions">
          <Button disabled={saving} onClick={onClose}>Cancel</Button>
          <Button
            variant="primary"
            type="submit"
            disabled={formInvalid}
            loading={saving}
            loadingLabel="Queuing...">
            <Icon name="play" size={12} />
            Run {totalCells > 0 ? `${totalCells} cell${totalCells === 1 ? '' : 's'}` : 'benchmark'}
          </Button>
        </div>
      </form>
    </div>
  );
}

// ---------------------------------------------------------------------------
// Shared bits: pills, score chips, sortable table headers
// ---------------------------------------------------------------------------

function BenchmarkSortHeader({ label, sortKey, current, dir, onSort, className = '' }) {
  const active = current === sortKey;
  const ariaSort = active ? (dir === 'asc' ? 'ascending' : 'descending') : 'none';
  // Active columns show a solid ▲/▼; inactive columns ship a ↕ glyph that
  // sits at zero opacity until the user hovers the header — that way the
  // table doesn't read as visually noisy by default, but every clickable
  // column reveals its affordance on the first hover.
  const arrow = active ? (dir === 'asc' ? '▲' : '▼') : '↕';
  const headerClass = [
    'benchmark-sort-header',
    active ? 'is-active' : 'is-inactive',
    className,
  ].filter(Boolean).join(' ');
  return (
    <th
      role="columnheader"
      aria-sort={ariaSort}
      tabIndex={0}
      className={headerClass}
      onClick={() => onSort(sortKey)}
      onKeyDown={(event) => {
        if (event.key === 'Enter' || event.key === ' ') {
          event.preventDefault();
          onSort(sortKey);
        }
      }}>
      <span className="benchmark-sort-header__inner">
        {label}
        <span
          aria-hidden="true"
          className={`benchmark-sort-header__arrow${active ? '' : ' benchmark-sort-header__arrow--inactive'}`}>
          {arrow}
        </span>
      </span>
    </th>
  );
}

function BenchmarkTableToolbar({ visible, onReset, label = 'Reset sort' }) {
  if (!visible) return null;
  return (
    <div className="benchmark-table-toolbar">
      <button
        type="button"
        className="link-btn benchmark-table-toolbar__reset"
        onClick={onReset}>
        {label}
      </button>
    </div>
  );
}

// Round a benchmark score to one decimal place. Non-finite inputs (NaN,
// ±Infinity, unparseable strings) come back as null so callers can render
// an em-dash placeholder instead.
function roundBenchmarkScore(score) {
  const numeric = Number(score);
  if (!Number.isFinite(numeric)) {
    return null;
  }
  return Math.round(numeric * 10) / 10;
}

// Colored chip rendering an evaluation score out of 5, or an em-dash when
// the score is absent.
//
// Tone bands: >= 4 pass, [3, 4) warn, < 3 fail. The previous strict check
// (`score === 3`) pushed fractional averages like 3.5 — which call sites
// produce by rounding averages to one decimal — into the 'fail' band even
// though they sit between warn and pass.
function BenchmarkScoreChip({ score }) {
  if (score === null || score === undefined) return <span className="text-xs muted">—</span>;
  const tone = score >= 4 ? 'pass' : score >= 3 ? 'warn' : 'fail';
  return (
    <span className={`score-chip score-chip-${tone}`} title={`Score ${score}/5`}>
      {score}/5
    </span>
  );
}

function BenchmarkRunStatusPill({ run }) {
  const status = run?.status || 'pending';
  if (status === 'completed') return <Pill tone="ok">completed</Pill>;
  if (status === 'tester_failed' || status === 'evaluation_failed') return <Pill tone="bad">{status.replace('_', ' ')}</Pill>;
  if (status === 'timed_out') return <Pill tone="bad">timed out</Pill>;
  if (status === 'canceled') return <Pill tone="neutral">canceled</Pill>;
  return <Pill tone="warn">{status}</Pill>;
}

function BenchmarkHarnessCell({ harnessKey, sdkKey, sdkName, modelName }) {
  const normalizedHarnessKey = normalizeHarnessKey(harnessKey);
  const normalizedSdkKey = normalizeHarnessKey(sdkKey);
  const logo = normalizedHarnessKey
    ? BENCHMARK_HARNESS_LOGOS[normalizedHarnessKey]
    : normalizedSdkKey
      ? BENCHMARK_SDK_LOGOS[normalizedSdkKey]
      : null;
  const harnessLabel = logo?.label || sdkName || harnessKey || sdkKey || null;
  if (!harnessLabel && !modelName) return <span className="muted text-xs">—</span>;
  return (
    <span className="harness-cell">
      <span className="harness-cell-glyph" aria-hidden>
        {logo
          ? <img src={logo.src} alt="" className="harness-cell-logo" width="14" height="14" />
          : <Icon name="cpu" size={12} />}
      </span>
      {harnessLabel && <span className="harness-cell-name">{harnessLabel}</span>}
      {modelName && <span className="harness-cell-model mono">{displayModelName(modelName)}</span>}
    </span>
  );
}

// Canonicalize a harness/SDK key: trim, lowercase, and map the legacy
// hyphenated 'claude-code' alias onto the canonical 'claude_code' key.
function normalizeHarnessKey(key) {
  const lowered = String(key || '').trim().toLowerCase();
  return lowered === 'claude-code' ? 'claude_code' : lowered;
}

function BenchmarkBatchStatusPill({ status }) {
  if (!status) return null;
  if (status === 'completed') return <Pill tone="ok">completed</Pill>;
  if (status === 'partially_failed') return <Pill tone="warn">partial</Pill>;
  if (status === 'failed') return <Pill tone="bad">failed</Pill>;
  if (status === 'canceled') return <Pill tone="neutral">canceled</Pill>;
  return <Pill tone="warn">{status}</Pill>;
}

function BenchmarkStat({ label, value }) {
  return (
    <div className="benchmark-summary-tile">
      <div className="benchmark-summary-tile__label">{label}</div>
      <div className="benchmark-summary-tile__value">{value}</div>
    </div>
  );
}

// Extract a single field's value from a JSON body string by bracket-matching.
// Works even when the outer JSON is truncated, as long as the target field
// (and its complete value) appears before the truncation point.
//
// Supports string, array, and object values. Returns `undefined` when the
// key is absent or its value cannot be recovered.
//
// Changes from the previous version: the inner string-value `match`
// variable no longer shadows the outer regex match, and the value start is
// derived directly from the regex match end (the matched text always ends
// with ':'), replacing the fragile indexOf/lastIndexOf colon hunt.
function extractJsonField(body, key) {
  // Require the key to be preceded by '{' or ',' (with optional whitespace)
  // so occurrences of the key name inside string values are not matched.
  const keyPattern = new RegExp('(?:^|[{,])\\s*"' + key + '"\\s*:', 'm');
  const keyMatch = keyPattern.exec(body);
  if (!keyMatch) return undefined;
  // The matched text ends with the ':' separator, so the value starts right
  // after the match; skip any whitespace before the opener.
  let i = keyMatch.index + keyMatch[0].length;
  while (i < body.length && /\s/.test(body[i])) i++;
  if (i >= body.length) return undefined;
  const opener = body[i];
  if (opener === '"') {
    // String value — extract with an escape-aware regex, then unescape via
    // JSON.parse (falling back to the raw capture on malformed escapes).
    const strMatch = body.slice(i).match(/^"((?:[^"\\]|\\.)*)"/);
    if (!strMatch) return undefined;
    try { return JSON.parse(`"${strMatch[1]}"`); } catch (_) { return strMatch[1]; }
  }
  // Only array/object values are supported beyond strings.
  if (opener !== '[' && opener !== '{') return undefined;
  const closer = opener === '[' ? ']' : '}';
  let depth = 0;
  let j = i;
  while (j < body.length) {
    const ch = body[j];
    if (ch === '"') {
      // Skip over a string literal, honoring backslash escapes, so brackets
      // inside strings don't perturb the depth counter.
      j++;
      while (j < body.length && body[j] !== '"') {
        if (body[j] === '\\') j++;
        j++;
      }
    } else if (ch === opener) {
      depth++;
    } else if (ch === closer) {
      depth--;
      if (depth === 0) {
        // Matched the closing bracket of the value — parse just that span.
        try { return JSON.parse(body.slice(i, j + 1)); } catch (_) { return undefined; }
      }
    }
    j++;
  }
  // Ran off the end (value truncated) without closing the bracket.
  return undefined;
}

// Last-resort extractor for truncated LLM output: strips code fences then
// extracts each batch-comparison field independently via bracket matching,
// so fields that appear before the truncation point are recovered.
function tryExtractPartialBatchJson(text) {
  if (!text || typeof text !== 'string') return null;
  // Strip any surrounding markdown code fence before field extraction.
  const body = text.trim()
    .replace(/^```(?:json)?\s*/m, '')
    .replace(/\s*```\s*$/m, '')
    .trim();
  // Bail unless both required field names at least appear in the text.
  if (!(body.includes('"summary"') && body.includes('"winners"'))) return null;
  const summary = extractJsonField(body, 'summary');
  if (typeof summary !== 'string' || !summary) return null;
  // Each list field falls back to [] when missing or truncated away.
  const listField = (key) => extractJsonField(body, key) ?? [];
  return {
    summary,
    winners: listField('winners'),
    losers: listField('losers'),
    similarities: listField('similarities'),
    differences: listField('differences'),
    root_causes: listField('root_causes'),
  };
}

// Try to parse JSON from any text — handles raw JSON, markdown code fences,
// or JSON embedded somewhere inside a longer string. Returns the first
// successfully-parsed object, or null.
function tryExtractJson(text) {
  if (typeof text !== 'string' || !text) return null;
  const trimmed = text.trim();
  const isObject = (value) => value && typeof value === 'object';
  // Candidates, in order of preference:
  //   1. the trimmed text as-is,
  //   2. the text with a ```json ... ``` / ``` ... ``` fence stripped,
  //   3. the outermost {...} span found anywhere in the text.
  const candidates = [trimmed];
  candidates.push(
    trimmed.replace(/^```(?:json)?\s*/m, '').replace(/\s*```\s*$/m, '').trim(),
  );
  const open = trimmed.indexOf('{');
  const close = trimmed.lastIndexOf('}');
  if (open !== -1 && close > open) candidates.push(trimmed.slice(open, close + 1));
  for (const candidate of candidates) {
    try {
      const parsed = JSON.parse(candidate);
      if (isObject(parsed)) return parsed;
    } catch (_) {
      // Fall through to the next candidate.
    }
  }
  return null;
}

function BenchmarkRecapCardTitle() {
  return (
    <span className="benchmark-recap-card__title">
      <Icon name="sparkle" size={14} className="benchmark-recap-card__title-icon" />
      AI Analysis
    </span>
  );
}

// AI cross-comparison card for a batch. Renders one of four shapes:
//   1. placeholder — no recap record exists yet,
//   2. hint — recap exists but is generating or failed,
//   3. simple recap — summary text plus optional highlights list,
//   4. rich batch comparison — winners/losers, a similarities/differences
//      tab group, and root causes.
function BenchmarkRecapCard({ recap }) {
  // Which list the Similarities/Differences tab group is showing.
  const [simDiffTab, setSimDiffTab] = useStateB('similarities');

  if (!recap) {
    return (
      <section className="card benchmark-recap-card benchmark-recap-card--placeholder">
        <div className="card-header">
          <div className="card-title"><BenchmarkRecapCardTitle /></div>
        </div>
        <div className="benchmark-recap-card__hint text-sm muted">
          AI cross-comparison will be generated automatically when all cells finish.
        </div>
      </section>
    );
  }

  if (recap.status !== 'completed') {
    return (
      <section className="card benchmark-recap-card">
        <div className="card-header">
          <div className="card-title"><BenchmarkRecapCardTitle /></div>
        </div>
        <div className="benchmark-recap-card__hint text-sm muted">
          {recap.status === 'failed'
            ? `AI recap failed: ${recap.error_message || 'unknown error'}`
            : 'AI recap is generating…'}
        </div>
      </section>
    );
  }

  // Resolve structured data: details may already be parsed, or the LLM output
  // may have been stored unparsed in details.raw / recap.summary. Try each
  // source in order so existing records with un-extracted JSON still work.
  // We return any successfully-parsed object (not just batch-field ones) so
  // that non-batch per-run recaps also get d.summary / d.highlights populated.
  const d = (() => {
    const base = recap.details || {};
    // "Batch fields" are the rich-comparison arrays; their presence means
    // `base` is already usable without any re-parsing.
    const hasBatchFields = (o) => o && (
      Array.isArray(o.winners) || Array.isArray(o.losers) ||
      Array.isArray(o.similarities) || Array.isArray(o.differences) ||
      Array.isArray(o.root_causes)
    );
    if (hasBatchFields(base) || typeof base.summary === 'string' || Array.isArray(base.highlights)) return base;
    // Full-JSON extraction from the raw LLM output, then the summary text.
    if (base.raw) { const p = tryExtractJson(base.raw); if (p) return p; }
    if (recap.summary) { const p = tryExtractJson(recap.summary); if (p) return p; }
    // Last resort: field-by-field extraction handles truncated LLM output
    if (base.raw) { const p = tryExtractPartialBatchJson(base.raw); if (p) return p; }
    if (recap.summary) { const p = tryExtractPartialBatchJson(recap.summary); if (p) return p; }
    return base;
  })();
  // Any batch-comparison array present → render the rich layout below.
  const isBatchComparison = Array.isArray(d.winners) || Array.isArray(d.losers) ||
    Array.isArray(d.similarities) || Array.isArray(d.differences) || Array.isArray(d.root_causes);

  if (!isBatchComparison) {
    return (
      <section className="card benchmark-recap-card">
        <div className="card-header">
          <div className="card-title"><BenchmarkRecapCardTitle /></div>
          <span className="text-xs muted benchmark-recap-card__timestamp">
            {formatRelative(recap.updated_at || recap.created_at)}
          </span>
        </div>
        <div className="benchmark-recap-card__body">{d.summary || recap.summary}</div>
        {Array.isArray(d.highlights) && d.highlights.length > 0 && (
          <ul className="benchmark-recap-card__highlights">
            {d.highlights.map((line, idx) => (
              <li key={idx} className="text-sm">{line}</li>
            ))}
          </ul>
        )}
      </section>
    );
  }

  // Rich layout: normalize every section to an array so rendering below is
  // uniform even when the LLM omitted some fields.
  const winners = Array.isArray(d.winners) ? d.winners : [];
  const losers = Array.isArray(d.losers) ? d.losers : [];
  const similarities = Array.isArray(d.similarities) ? d.similarities : [];
  const differences = Array.isArray(d.differences) ? d.differences : [];
  const rootCauses = Array.isArray(d.root_causes) ? d.root_causes : [];

  return (
    <section className="card benchmark-recap-card benchmark-recap-card--rich">
      <div className="card-header">
        <div className="card-title"><BenchmarkRecapCardTitle /></div>
        <span className="text-xs muted benchmark-recap-card__timestamp">
          {formatRelative(recap.updated_at || recap.created_at)}
        </span>
      </div>

      {(d.summary || recap.summary) && (
        <div className="benchmark-recap-section">
          <div className="benchmark-recap-section__label">Summary</div>
          <p className="benchmark-recap-section__text">{d.summary || recap.summary}</p>
        </div>
      )}

      {winners.length > 0 && (
        <div className="benchmark-recap-section">
          <div className="benchmark-recap-section__label benchmark-recap-section__label--winners">
            Winners
          </div>
          <ul className="benchmark-recap-winlose-list">
            {winners.map((w, idx) => (
              <li key={idx} className="benchmark-recap-winlose-item benchmark-recap-winlose-item--win">
                <span className="benchmark-recap-winlose-item__dot" />
                <span className="benchmark-recap-winlose-item__text">{w.why || w.run_id}</span>
              </li>
            ))}
          </ul>
        </div>
      )}

      {losers.length > 0 && (
        <div className="benchmark-recap-section">
          <div className="benchmark-recap-section__label benchmark-recap-section__label--losers">
            Losers
          </div>
          <ul className="benchmark-recap-winlose-list">
            {losers.map((l, idx) => (
              <li key={idx} className="benchmark-recap-winlose-item benchmark-recap-winlose-item--lose">
                <span className="benchmark-recap-winlose-item__dot" />
                <span className="benchmark-recap-winlose-item__text">{l.why || l.run_id}</span>
              </li>
            ))}
          </ul>
        </div>
      )}

      {(similarities.length > 0 || differences.length > 0) && (
        <div className="benchmark-recap-section">
          <div className="benchmark-recap-simdiff-tabs">
            <button
              className={`benchmark-recap-simdiff-tab${simDiffTab === 'similarities' ? ' benchmark-recap-simdiff-tab--active' : ''}`}
              onClick={() => setSimDiffTab('similarities')}
            >
              Similarities
              {similarities.length > 0 && (
                <span className="benchmark-recap-simdiff-tab__badge">{similarities.length}</span>
              )}
            </button>
            <button
              className={`benchmark-recap-simdiff-tab${simDiffTab === 'differences' ? ' benchmark-recap-simdiff-tab--active' : ''}`}
              onClick={() => setSimDiffTab('differences')}
            >
              Differences
              {differences.length > 0 && (
                <span className="benchmark-recap-simdiff-tab__badge">{differences.length}</span>
              )}
            </button>
          </div>
          {simDiffTab === 'similarities' && similarities.length > 0 && (
            <ul className="benchmark-recap-list">
              {similarities.map((s, idx) => <li key={idx} className="text-sm">{s}</li>)}
            </ul>
          )}
          {simDiffTab === 'differences' && differences.length > 0 && (
            <ul className="benchmark-recap-list">
              {differences.map((s, idx) => <li key={idx} className="text-sm">{s}</li>)}
            </ul>
          )}
          {simDiffTab === 'similarities' && similarities.length === 0 && (
            <p className="benchmark-recap-section__empty text-sm muted">No similarities recorded.</p>
          )}
          {simDiffTab === 'differences' && differences.length === 0 && (
            <p className="benchmark-recap-section__empty text-sm muted">No differences recorded.</p>
          )}
        </div>
      )}

      {rootCauses.length > 0 && (
        <div className="benchmark-recap-section">
          <div className="benchmark-recap-section__label">Root Causes</div>
          <ul className="benchmark-recap-list">
            {rootCauses.map((rc, idx) => <li key={idx} className="text-sm">{rc}</li>)}
          </ul>
        </div>
      )}
    </section>
  );
}

// ---------------------------------------------------------------------------
// Aggregate leaderboard table (workflow detail)
// ---------------------------------------------------------------------------

function BenchmarkLeaderboardTable({ rows, loading, emptyHint }) {
  const [sortKey, setSortKey] = useStateB(DEFAULT_LEADERBOARD_SORT.key);
  const [sortDir, setSortDir] = useStateB(DEFAULT_LEADERBOARD_SORT.dir);
  const [showByModel, setShowByModel] = useStateB(false);
  const sortIsDefault = isDefaultSort(sortKey, sortDir, DEFAULT_LEADERBOARD_SORT);

  const aggregatedRows = useMemoB(() => aggregateLeaderboardBySource(rows), [rows]);
  const baseRows = showByModel ? rows : aggregatedRows;
  const sortedRows = useMemoB(
    () => sortBenchmarkLeaderboard(baseRows, sortKey, sortDir),
    [baseRows, sortKey, sortDir],
  );

  if (loading) {
    return (
      <div className="benchmark-loader benchmark-loader--small"><LoadingSpinner /></div>
    );
  }

  if (!rows || rows.length === 0) {
    return (
      <div className="card benchmark-empty-card benchmark-leaderboard-empty">
        <div className="text-sm muted">{emptyHint || 'No data yet.'}</div>
      </div>
    );
  }

  function setSort(key) {
    if (sortKey === key) {
      setSortDir((d) => (d === 'desc' ? 'asc' : 'desc'));
      return;
    }
    setSortKey(key);
    setSortDir(BENCHMARK_DESC_DEFAULT_KEYS.has(key) ? 'desc' : 'asc');
  }

  function resetSort() {
    setSortKey(DEFAULT_LEADERBOARD_SORT.key);
    setSortDir(DEFAULT_LEADERBOARD_SORT.dir);
  }

  function toggleShowByModel() {
    setShowByModel((prev) => {
      // If hiding model breakdown while sorted by model, reset to default
      if (prev && sortKey === 'model') {
        setSortKey(DEFAULT_LEADERBOARD_SORT.key);
        setSortDir(DEFAULT_LEADERBOARD_SORT.dir);
      }
      return !prev;
    });
  }

  return (
    <div className="card benchmark-table-wrap">
      <div className="benchmark-table-toolbar benchmark-table-toolbar--leaderboard">
        {!sortIsDefault && (
          <button
            type="button"
            className="link-btn benchmark-table-toolbar__reset"
            onClick={resetSort}>
            Reset sort
          </button>
        )}
        <button
          type="button"
          className={`link-btn benchmark-leaderboard-model-toggle${showByModel ? ' is-active' : ''}`}
          onClick={toggleShowByModel}>
          {showByModel ? 'Hide model breakdown' : 'Breakdown by model'}
        </button>
      </div>
      <table className="ui-table benchmark-leaderboard-table">
        <thead>
          <tr>
            <th className="benchmark-table-row-num">#</th>
            <BenchmarkSortHeader label="Source" sortKey="source" current={sortKey} dir={sortDir} onSort={setSort} />
            {showByModel && (
              <BenchmarkSortHeader label="Harness" sortKey="model" current={sortKey} dir={sortDir} onSort={setSort} />
            )}
            <BenchmarkSortHeader label="Avg score" sortKey="avg_score" current={sortKey} dir={sortDir} onSort={setSort} />
            <BenchmarkSortHeader label="Avg duration" sortKey="avg_duration_ms" current={sortKey} dir={sortDir} onSort={setSort} />
            <BenchmarkSortHeader label="Avg tools" sortKey="avg_tool_calls" current={sortKey} dir={sortDir} onSort={setSort} />
            <BenchmarkSortHeader label="Avg in tokens" sortKey="avg_input_tokens" current={sortKey} dir={sortDir} onSort={setSort} />
            <BenchmarkSortHeader label="Avg out tokens" sortKey="avg_output_tokens" current={sortKey} dir={sortDir} onSort={setSort} />
            <BenchmarkSortHeader label="Runs" sortKey="runs" current={sortKey} dir={sortDir} onSort={setSort} />
          </tr>
        </thead>
        <tbody>
          {sortedRows.map((row, idx) => (
            <tr key={showByModel ? `${row.mcp_server_id ?? 'unknown'}-${row.tester_harness ?? row.tester_sdk_key ?? 'unknown'}-${row.tester_model_id ?? 'unknown'}` : (row.mcp_server_id || row.mcp_server_name || 'unknown')}>
              <td>{idx + 1}</td>
              <td>{row.mcp_server_name || '—'}</td>
              {showByModel && (
                <td>
                  <BenchmarkHarnessCell
                    harnessKey={row.tester_harness}
                    sdkKey={row.tester_sdk_key}
                    sdkName={row.tester_sdk_display_name}
                    modelName={row.tester_model_name} />
                </td>
              )}
              <td>
                <BenchmarkScoreChip score={row.avg_score != null ? Math.round(row.avg_score * 10) / 10 : null} />
              </td>
              <td>{row.avg_duration_ms != null ? formatDuration(row.avg_duration_ms) : '—'}</td>
              <td>{row.avg_tool_calls != null ? row.avg_tool_calls.toFixed(1) : '—'}</td>
              <td>{row.avg_input_tokens != null ? formatNumber(Math.round(row.avg_input_tokens)) : '—'}</td>
              <td>{row.avg_output_tokens != null ? formatNumber(Math.round(row.avg_output_tokens)) : '—'}</td>
              <td>{row.runs}</td>
            </tr>
          ))}
        </tbody>
      </table>
    </div>
  );
}

// ---------------------------------------------------------------------------
// Sort + helper utilities
// ---------------------------------------------------------------------------

// Columns whose natural starting direction is descending — when the user
// clicks one of these for the first time we go big→small, instead of the
// usual a→z ascending. One shared set covers the per-run cells table keys
// (score/duration/tools/token counts), the aggregated leaderboard keys
// (avg_*, runs), and the batches table keys (started/cells).
const BENCHMARK_DESC_DEFAULT_KEYS = new Set([
  'score',
  'avg_score',
  'started',
  'cells',
  'duration',
  'tools',
  'input_tokens',
  'output_tokens',
  'avg_duration_ms',
  'avg_tool_calls',
  'avg_input_tokens',
  'avg_output_tokens',
  'runs',
]);

// Each table has a "natural" default sort that the page lands on (and the
// "Reset sort" link returns to). Keeping these as objects makes the
// "is-default?" check and the reset action one-liners.
const DEFAULT_CELLS_SORT = { key: 'score', dir: 'desc' };           // per-run cells table
const DEFAULT_LEADERBOARD_SORT = { key: 'avg_score', dir: 'desc' }; // aggregate leaderboard
const DEFAULT_BATCHES_SORT = { key: 'started', dir: 'desc' };       // past-batches table

// True when the active (key, dir) pair matches a table's default sort
// descriptor (one of the DEFAULT_*_SORT objects above).
function isDefaultSort(key, dir, defaults) {
  const { key: defaultKey, dir: defaultDir } = defaults;
  return defaultKey === key && defaultDir === dir;
}

function sortBenchmarkRuns(runs, key, dir) {
  // Sort the per-run "cells" rows of a batch. Returns a new array — the
  // input is never mutated — and rows whose sort value is null always land
  // last, regardless of direction.
  //
  // Guard added for consistency with sortBenchmarkLeaderboard /
  // sortBenchmarkBatches: a missing or empty list yields [] instead of
  // throwing a TypeError on the spread below.
  if (!runs || runs.length === 0) return [];
  // The default "Score DESC" sort folds in a duration ASC tiebreaker so
  // ties resolve to "fastest first" — matches the original spec.
  if (isDefaultSort(key, dir, DEFAULT_CELLS_SORT)) {
    return [...runs].sort((a, b) => {
      const sa = a.evaluation_score, sb = b.evaluation_score;
      if (sa == null && sb == null) return (a.duration_ms || 0) - (b.duration_ms || 0);
      if (sa == null) return 1;
      if (sb == null) return -1;
      if (sb !== sa) return sb - sa;
      return (a.duration_ms || 0) - (b.duration_ms || 0);
    });
  }
  // Non-default sorts: map the column key to a comparable value, then run a
  // single nulls-last comparison pass (strings via localeCompare, numbers
  // via subtraction).
  const accessor = (run) => {
    switch (key) {
      case 'source': return (run.mcp_server_name || '').toLowerCase();
      case 'model': return benchmarkHarnessSortValue(run);
      case 'score': return run.evaluation_score;
      case 'duration': return run.duration_ms;
      case 'tools': return run.tool_call_count;
      case 'input_tokens': return run.input_tokens;
      case 'output_tokens': return run.output_tokens;
      case 'status': return (run.status || '').toLowerCase();
      default: return null;
    }
  };
  const factor = dir === 'asc' ? 1 : -1;
  return [...runs].sort((a, b) => {
    const va = accessor(a), vb = accessor(b);
    if (va == null && vb == null) return 0;
    if (va == null) return 1;
    if (vb == null) return -1;
    if (typeof va === 'string') return factor * va.localeCompare(vb);
    return factor * (va - vb);
  });
}

// Collapse per-(source, model) rows into one row per source, computing
// runs-weighted averages so sources with more runs don't get under-counted.
// Each metric keeps its own weight so a null value on one row doesn't drag
// the other metrics' averages down. Returns a new array; input untouched.
function aggregateLeaderboardBySource(rows) {
  if (!rows || rows.length === 0) return [];
  // Output metric keys; each gets an independent weighted sum + weight.
  const METRICS = ['avg_score', 'avg_duration_ms', 'avg_tool_calls', 'avg_input_tokens', 'avg_output_tokens'];
  const bySource = new Map();
  for (const row of rows) {
    const sourceKey = row.mcp_server_id || row.mcp_server_name || '';
    let bucket = bySource.get(sourceKey);
    if (!bucket) {
      bucket = {
        mcp_server_id: row.mcp_server_id,
        mcp_server_name: row.mcp_server_name,
        runs: 0,
        sums: Object.fromEntries(METRICS.map((m) => [m, 0])),
        weights: Object.fromEntries(METRICS.map((m) => [m, 0])),
      };
      bySource.set(sourceKey, bucket);
    }
    // Weight by run count; rows reporting 0/undefined runs still count once.
    const weight = row.runs || 1;
    bucket.runs += weight;
    for (const metric of METRICS) {
      if (row[metric] != null) {
        bucket.sums[metric] += row[metric] * weight;
        bucket.weights[metric] += weight;
      }
    }
  }
  return [...bySource.values()].map((bucket) => {
    const aggregated = {
      mcp_server_id: bucket.mcp_server_id,
      mcp_server_name: bucket.mcp_server_name,
      runs: bucket.runs,
    };
    for (const metric of METRICS) {
      aggregated[metric] = bucket.weights[metric] > 0
        ? bucket.sums[metric] / bucket.weights[metric]
        : null;
    }
    return aggregated;
  });
}

// Sort aggregated leaderboard rows. Returns a new array; null sort values
// always go last regardless of direction.
function sortBenchmarkLeaderboard(rows, key, dir) {
  if (!rows) return [];
  // Default "avg_score DESC" folds in avg_duration_ms ASC as tiebreaker so
  // equal-score rows still order by "fastest first".
  if (isDefaultSort(key, dir, DEFAULT_LEADERBOARD_SORT)) {
    return [...rows].sort((a, b) => {
      const scoreA = a.avg_score;
      const scoreB = b.avg_score;
      const byDuration = (a.avg_duration_ms || 0) - (b.avg_duration_ms || 0);
      if (scoreA == null && scoreB == null) return byDuration;
      if (scoreA == null) return 1;
      if (scoreB == null) return -1;
      return scoreB !== scoreA ? scoreB - scoreA : byDuration;
    });
  }
  // Non-default sorts: 'source' and 'model' need derived values; every
  // other key reads the row field directly.
  const valueOf = (row) => {
    if (key === 'source') return (row.mcp_server_name || '').toLowerCase();
    if (key === 'model') return benchmarkHarnessSortValue(row);
    return row[key];
  };
  const sign = dir === 'asc' ? 1 : -1;
  return [...rows].sort((a, b) => {
    const left = valueOf(a);
    const right = valueOf(b);
    if (left == null) return right == null ? 0 : 1;
    if (right == null) return -1;
    return typeof left === 'string' ? sign * left.localeCompare(right) : sign * (left - right);
  });
}

// Build the lowercase "harness/SDK label + model name" string used as the
// sort value for "model" columns. A recognized harness key wins; the SDK
// logo table is only consulted when there is no harness key at all, and raw
// display fields are the final fallback.
function benchmarkHarnessSortValue(row) {
  const harnessKey = normalizeHarnessKey(row?.tester_harness);
  const sdkKey = normalizeHarnessKey(row?.tester_sdk_key);
  let logo = null;
  if (harnessKey) {
    logo = BENCHMARK_HARNESS_LOGOS[harnessKey];
  } else if (sdkKey) {
    logo = BENCHMARK_SDK_LOGOS[sdkKey];
  }
  const label = logo?.label
    || row?.tester_sdk_display_name
    || row?.tester_harness
    || row?.tester_sdk_key
    || '';
  return `${label} ${row?.tester_model_name || ''}`.toLowerCase();
}

// Sort past-batch rows. Returns a new array; null sort values go last
// regardless of direction. "started" falls back to created_at when
// started_at is missing; "duration" is only defined for batches carrying
// both started_at and completed_at.
function sortBenchmarkBatches(rows, key, dir) {
  if (!rows || rows.length === 0) return [];
  const sign = dir === 'asc' ? 1 : -1;
  const accessors = new Map([
    ['batch', (row) => (row.id || '').toLowerCase()],
    ['status', (row) => (row.status || '').toLowerCase()],
    ['cells', (row) => Number(row.completed_runs || 0)],
    ['started', (row) => {
      const ts = row.started_at || row.created_at;
      return ts ? new Date(ts).getTime() : null;
    }],
    ['duration', (row) => (row.completed_at && row.started_at
      ? new Date(row.completed_at).getTime() - new Date(row.started_at).getTime()
      : null)],
  ]);
  const valueFor = accessors.get(key) || (() => null);
  return [...rows].sort((a, b) => {
    const left = valueFor(a);
    const right = valueFor(b);
    if (left == null) return right == null ? 0 : 1;
    if (right == null) return -1;
    return typeof left === 'string' ? sign * left.localeCompare(right) : sign * (left - right);
  });
}

function pickFixedEvaluatorModelId(models) {
  // Mirror getFixedEvaluatorModelId from pages-workflows.jsx without
  // hard-depending on it being exported. We pick the cheapest active OpenAI
  // model so judge cost stays predictable across benchmarks. Returns '' when
  // no active OpenAI model exists. Single pass; on rank ties the earliest
  // model in the list wins (matches the original stable sort).
  const rankOf = (model) => {
    const haystack = `${model.model_key || ''} ${model.provider_model_name || ''} ${model.display_name || ''}`.toLowerCase();
    if (haystack.includes('gpt-5-mini')) return 0;
    if (haystack.includes('gpt-5-nano')) return 1;
    if (haystack.includes('mini')) return 2;
    if (haystack.includes('nano')) return 3;
    return 4;
  };
  let best = null;
  let bestRank = Infinity;
  for (const model of models || []) {
    if (!model.is_active || model.provider_key !== 'openai') continue;
    const rank = rankOf(model);
    if (rank < bestRank) {
      best = model;
      bestRank = rank;
    }
  }
  return best?.id || '';
}

function pickDefaultBenchmarkTesterTargets(models) {
  // Pick one default target per active provider so a fresh benchmark covers
  // both Anthropic and OpenAI out of the box (aligns with the production
  // workflow defaults users are already familiar with). Falls back to the
  // first active model of any provider when neither is available; returns
  // [] when there are no active models at all.
  const active = (models || []).filter((m) => m.is_active);
  const targets = [];
  const seen = new Set();
  for (const providerKey of ['openai', 'anthropic']) {
    const match = active.find((m) => m.provider_key === providerKey);
    if (!match) continue;
    const harness = harnessKeyForModel(match);
    const dedupeKey = `${harness}::${match.id}`;
    if (seen.has(dedupeKey)) continue;
    targets.push({ harness, modelId: match.id });
    seen.add(dedupeKey);
  }
  if (targets.length === 0 && active[0]) {
    targets.push({ harness: harnessKeyForModel(active[0]), modelId: active[0].id });
  }
  return targets;
}

// Resolve the tester harness key for a model record. The model's sdk_key
// takes precedence over provider_key; anything unrecognized — including a
// missing model — defaults to 'claude_code'.
function harnessKeyForModel(model) {
  if (!model) return 'claude_code';
  const bySdk = new Map([
    ['codex', 'codex'],
    ['claude', 'claude_code'],
    ['gemini', 'gemini'],
    ['cursor', 'cursor'],
    ['openclaw', 'openclaw'],
    ['opencode', 'opencode'],
    ['anthropic_api', 'claude'],
    ['openai_api', 'chatgpt'],
  ]);
  const byProvider = new Map([
    ['openai', 'codex'],
    ['google', 'gemini'],
    ['cursor', 'cursor'],
    ['opencode', 'opencode'],
  ]);
  return bySdk.get(model.sdk_key) ?? byProvider.get(model.provider_key) ?? 'claude_code';
}

// ---------------------------------------------------------------------------
// Window exports (loaded by app.jsx via the global registry pattern)
// ---------------------------------------------------------------------------

// Only the page component is published on window here; the sort/aggregate
// helpers above remain file-private.
window.BenchmarkPage = BenchmarkPage;
