% !TEX TS-program = pdflatex
% !TEX encoding = UTF-8 Unicode
%
% CaP-X paper, single self-contained source.
% Compile: pdflatex -> bibtex (none, refs are inline) -> pdflatex x2
%
\documentclass[conference,letterpaper,10pt]{IEEEtran}
\IEEEoverridecommandlockouts

\usepackage{cite}
\usepackage{amsmath,amssymb,amsfonts}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{graphicx}
\usepackage{booktabs}
\usepackage{subcaption}
\usepackage{tikz}
\usepackage{xcolor}
\usepackage{url}
\usepackage{hyperref}
\hypersetup{
  colorlinks=true,
  linkcolor=black,
  citecolor=black,
  urlcolor=blue!60!black
}

\usetikzlibrary{shapes.geometric, arrows.meta, positioning, fit, calc}

\graphicspath{{figures/}}

% A lightweight \todo macro (the user will run pdflatex; we keep it harmless).
\newcommand{\todo}[1]{\textcolor{red}{\textbf{[TODO:}\,#1\textbf{]}}}

% Custom small caps for condition labels.
\newcommand{\cond}[1]{\textsc{#1}}

\title{When Auto-Mined Robot Skills Help:\\A CaP-X-Style Ablation of Executable Skill Libraries}

\author{
\IEEEauthorblockN{realkim93}
\IEEEauthorblockA{
capx (independent), Seoul, Republic of Korea\\
CaP-X Team\\
Correspondence: \texttt{pheonix37@naver.com}\\[2pt]
\textit{AI assistance: Anthropic Claude (Opus 4.7, 1M context) was used as a coding and writing assistant; see Acknowledgements.}
}
}

\begin{document}

\maketitle

\begin{abstract}
We test whether an LLM-controlled robot code-generation agent benefits from executable skills mined from its own prior programs. Starting from the NVIDIA CaP-X setting, we add a \emph{capx} skill-memory layer to a Code-as-Policies-style \texttt{cube\_lifting} pipeline: generated Python functions are extracted by AST, filtered by quality gates, optionally deduplicated, and reinjected into later prompts and execution namespaces. The north-star hypothesis is that executable mined skills should improve later robot-code generation beyond a no-skills baseline.

The strongest result is conditional. With \texttt{gpt-4.1}, the no-skill baseline (\cond{P21\_a}) reaches $39/50 = 78.0\%$ (Wilson 95\% CI $[64.8\%, 87.2\%]$). Two no-dedup mined-skill libraries reach $67/70 = 95.7\%$ for the namespace-enabled 16-skill arm (\cond{C1}, CI $[88.1\%, 98.5\%]$) and $68/70 = 97.1\%$ for the gated 14-skill arm (\cond{C2}, CI $[90.2\%, 99.2\%]$), separating from the baseline by roughly $+18$--$19$pp. A typed empty-stub control (\cond{empty\_ns}) reaches only $13/30 = 43.3\%$, showing that executable function bodies, not merely a typed namespace, drive the effect.

The production-style library does not preserve this gain. The structural-hash deduplicated library (\cond{C3v2}) reaches $58/70 = 82.9\%$, statistically indistinguishable from the no-skill baseline and $-14$pp below the no-dedup gated library. A quality-ranked same-size 11-skill library (\cond{manual\_11}), formed by dropping the three lowest-quality \cond{C2} skills, reaches $66/70 = 94.3\%$, suggesting that the loss is caused by survivor selection rather than by the number of skills. Docstring controlled tests further show that LLMs strongly prefer documented variants when task-fit-equivalent functions compete.

Thus v1 supports a bounded claim: auto-mined executable skills improve this CaP-X-derived \texttt{cube\_lifting} pipeline under no-dedup selection, but the current dedup survivor rule can erase the benefit.

A v2 follow-up resolves the two largest open boundaries with two real-run additions ($n{=}50$ each on Claude/DeepSeek baselines + library, $n{=}70$ each on three Dedup~v3 cutoffs; we report the capx-defined ``Task completed'' rate -- the trial's final attempt being a success -- so v1 and v2 numbers are directly comparable). \emph{Multi-backbone library effect, fully closed:} on Claude Sonnet~4 the library effect is $+12$pp ($98.0\%$ [89.5, 99.6] vs $86.0\%$ [73.8, 93.0]; the CIs overlap by 3.5pp at the boundary, but a one-sided binomial test gives $P[X{\ge}49\mid n{=}50, p{=}0.86]\approx 0.5\%$); on DeepSeek~v3 it is $+88.0$pp ($94.0\%$ [83.8, 97.9] vs $6.0\%$ [2.1, 16.2]; CIs fully separated, $P\approx 10^{-54}$). The DeepSeek finding is the largest single-task library benefit observed in this project and demonstrates that library magnitude is baseline-dependent (the weakest baseline benefits most). \emph{Dedup~v3 algorithmic robustness:} top-$k$-by-quality\_score at $n{=}70$ each reaches $91.4\%/87.1\%/97.1\%$ for $k\in\{10,12,13\}$. Combined with $k{=}11$ ($94.3\%$ from paper~v1's manual-11 arm), $k{=}11$ and $k{=}13$ are statistically separated from the production structural-dedup library C3v2 ($82.9\%$, one-sided $P{=}10^{-3}$), $k{=}10$ is marginally above ($P{=}0.034$), and $k{=}12$ is not separated ($P{=}0.22$). The smart dedup is therefore an algorithmic recipe with non-monotone $k$-sensitivity, not a uniformly robust function.
Three additional ``smoke'' micro-evaluations (Section~\ref{sec:smoke-phase2}) probe the boundaries of these claims: (i)~a new \texttt{cube\_stack\_3} task floors at $0/15$ for both no-skills and library arms, with the bottleneck localised to a vision-pipeline mask-fragmentation regime that is task-specific rather than library-related, and the library still confers a $24$--$34\%$ \emph{code-efficiency} reduction even at the floor; (ii)~a round-$2$ mining attempt on a mid-range library returns $0$ promotable new skills, evidencing a ``namespace saturation'' where dense libraries induce purely imperative LLM code with no further function-level abstractions to extract; (iii)~a verbose mechanism trace ($n{=}3$ each on Claude / DeepSeek) suggests the multi-backbone library asymmetry tracks self-correction ability rather than reasoning ability -- both backbones make the same first-attempt errors but diverge in regeneration convergence.
\end{abstract}

\begin{IEEEkeywords}
LLM agents, code-as-policies, skill libraries, ablation study, robot manipulation, CaP-X.
\end{IEEEkeywords}

\section{Introduction}
\label{sec:intro}

NVIDIA CaP-X~\cite{capx2026} studies coding agents for robot manipulation: an LLM receives a small robot API, emits executable Python, runs the code in simulation, and receives visual feedback. This paper keeps that observable code-generation loop but asks a narrower question: can the agent turn its own past robot-control programs into a reusable executable skill library that improves future trials?

The proposed extension, \emph{capx}, mines named Python functions from previous generated code, filters them by execution and code-quality signals, and reinjects promoted functions into both the prompt and the execution namespace. The north-star hypothesis is simple: if the mined functions encode useful robot-control subroutines, later trials should outperform a no-skills CaP-X-style baseline. The scientific risk is equally simple: apparent gains may be caused by harness fixes, lucky samples, or survivor-selection artifacts rather than by reusable skill content.

We evaluate this hypothesis on \texttt{cube\_lifting}. The claim ladder for v1 is:

\begin{enumerate}
    \item \textbf{Measured within-task gain.} The no-dedup mined-skill library beats the no-skill baseline (\cond{P21\_a}) on \texttt{cube\_lifting} by roughly $+18$--$19$pp with disjoint Wilson intervals.
    \item \textbf{Skill bodies matter.} A typed-but-empty namespace control performs far below both baseline and real-skill conditions, so the gain is not just scaffolding.
    \item \textbf{Survivor selection matters.} The structural-hash deduplicated production library falls back toward baseline, while a same-size quality-ranked manual subset (\cond{manual\_11}) recovers most of the no-dedup performance.
    \item \textbf{Transfer is not yet measured.} \texttt{cube\_stack} and LIBERO floor under the present harness; Claude/DeepSeek \cond{C2} endpoints are high but lack matched no-skills baselines.
\end{enumerate}

The paper's contribution is therefore not a broad claim that mined robot skills generally solve manipulation tasks. It is a bounded ablation result: executable mined skills can help a CaP-X-derived \texttt{cube\_lifting} agent, but current structural dedup can erase the effect, and transfer requires a better measurement regime.

Our reported contributions are:

\begin{enumerate}
    \item A CaP-X-derived executable skill-mining pipeline for robot code generation, including AST extraction, quality gates, deduplication, and prompt/runtime reinjection.
    \item A controlled \texttt{cube\_lifting} ablation showing a no-dedup skill-library win over a no-skills baseline, plus an empty-stub control isolating executable body content.
    \item Evidence that the present dedup survivor rule is the main failure point: the production structural-dedup library (\cond{C3v2}) loses the no-dedup gated-library (\cond{C2}) gain, while the quality-ranked same-size library (\cond{manual\_11}) recovers it at the same library size.
    \item Docstring and decoy probes explaining why survivor choice affects LLM invocation.
    \item Negative transfer results that delimit v1 and define the next experiment: a medium-difficulty task with neither ceiling nor floor.
\end{enumerate}

\section{Related Work}
\label{sec:related}

\textbf{Code-as-policies for robots.} CaP-X~\cite{capx2026} is a recent baseline in which a small base API plus VDM feedback drives an LLM to produce executable robot code. Our pipeline begins from CaP-X and inserts an auto-skill discovery stage. The original Code-as-Policies paper~\cite{liang2023code} did not include a learned library.

\textbf{Agent harnesses with cumulative skills.} Voyager~\cite{voyager} pioneered the idea of an LLM agent that grows a skill library over a long horizon in Minecraft. SWE-agent~\cite{sweagent} and OpenHands~\cite{openhands} apply similar ideas to software engineering. We borrow the \emph{plumbing} of skill mining from this lineage but examine its behaviour on a robot task with a much harder physics-grounded reward.

\textbf{Skill libraries for robots.} Prior work on robot skill libraries typically curates skills manually or uses RL to learn options. Our setting is closer to neural program induction: the LLM emits Python, we run AST analysis, we promote candidates that pass quality gates, and we feed the promoted set back into the next prompt.

\textbf{Methodology.} Our work also touches on ablation methodology for agent systems: when a high-performing run follows several simultaneous harness changes, each causal explanation must be treated as a hypothesis until isolated by controls.

\section{Method}
\label{sec:method}

\subsection{capx pipeline}
\label{sec:method-pipeline}

Figure~\ref{fig:arch} sketches the \emph{capx} pipeline. A single \texttt{cube\_lifting} trial proceeds as: (i) the LLM is shown the base APIs and the current promoted skill library; (ii) the LLM emits a Python code block; (iii) the block is executed in the robosuite environment~\cite{robosuite2020}; (iv) the VDM (\texttt{gemini-3.1-pro-preview}) decides whether the block succeeded, and if not, the LLM is asked to regenerate; (v) when the trial finishes, all named functions emitted across the trial are AST-walked and added to a candidate pool; (vi) candidates are scored by quality gates and structurally deduplicated, then promoted into the library that the next trial will see.

\begin{figure}[t]
\centering
\includegraphics[width=\columnwidth]{fig-skill-library-architecture.pdf}
\caption{capx skill mining and re-injection pipeline. AST-extracted candidates flow through soft gates (success rate, generality, code quality), hard-fail gates (resolvable deps, non-noop, vision-server), and dedup before being injected into the next trial's prompt. Figure regenerated for paper; see Section~\ref{sec:method-pipeline}.}
\label{fig:arch}
\end{figure}

\textbf{Quality gates.} The gate stack has three soft gates -- minimum success rate, source-task generality (a skill must occur in $\geq 2$ tasks), and code quality (docstring + complexity + AST validity) -- and three hard-fail gates -- resolvable dependencies, non-noop body, and vision-server availability. A skill must pass an overall threshold of $0.5$ on the soft side and not trigger any hard-fail to be eligible for promotion. Phase~1.4 + 1.8 (Section~\ref{sec:exp-mechanical}) measures which gates actually fire on our pool.

\textbf{Dedup.} The original dedup keys each candidate by a structural signature \((\textsc{arg\_count}, \textsc{has\_return}, \mathit{frozenset}(\textsc{called\_names}))\). Within a cluster sharing a key, the original survivor rule kept the lexicographically first variant. Dedup~v2 changes the survivor rule to \((\textsc{has\_docstring}, \textsc{success\_rate}, \textsc{name})\), prioritising docstring-bearing variants.

\textbf{Namespace seeding.} The fix that turned out to dominate (Section~\ref{sec:exp-ablation}) is at execution time, not at extraction time: when the prompt's skill library is exec-compiled, we seed the namespace with the base-API callables, the sibling skills, and a small set of pure-numpy utility functions that some skills depend on (e.g.\ rotation-matrix conversions). Without this, a promoted skill that calls \texttt{solve\_ik(...)} hits a \texttt{NameError}.

\subsection{Ablation design}
\label{sec:method-ablation}

We compare six conditions, each with the same environment (seeds 1--15 for \cond{C0}--\cond{C3}; seeds 1--10 for \cond{P21\_a}), \texttt{num\_workers}=1, the same VDM, the same base API, and the same model (\texttt{openrouter/openai/gpt-4.1}). Only the skill set and the namespace-seeding toggle vary. For readability, each arm is named by its role first and its internal run ID second:

\begin{itemize}
\item \textbf{Broken-namespace diagnostic} (\cond{C0}): pre-Group-D 16 skills, namespace seeding \emph{off} -- intended to reproduce the Group C bug mechanism.
\item \textbf{Namespace-enabled no-dedup library} (\cond{C1}): same 16 skills, namespace seeding \emph{on} -- both a namespace-seeding diagnostic and a powered high-performing no-dedup arm.
\item \textbf{No-dedup gated mined-skill library} (\cond{C2}): 14 skills after applying the new gate stack, namespace seeding on.
\item \textbf{Original structural-dedup diagnostic} (\cond{C3}): 11 skills after applying the new gates and the original signature-based dedup; this is the historical Group D library.
\item \textbf{Production structural-dedup library} (\cond{C3v2}): the same 11-skill count but with Dedup~v2 (docstring-aware survivor).
\item \textbf{No-skill baseline} (\cond{P21\_a}): \emph{no skills} (\texttt{FrankaControlApiReduced}); 10 trials on seeds 1--10.
\end{itemize}

For \cond{C3v2}, we additionally re-ran $n{=}50$ to estimate the true mean and 95\% Wilson confidence interval (Section~\ref{sec:exp-c3v2-n50}).

\subsection{Held-out transfer protocol}
\label{sec:method-heldout}

For \texttt{cube\_stack} (Phase~2.4c, 2.4d, Section~\ref{sec:exp-heldout}) we ran four conditions (\cond{p22\_a}, \cond{C2}, \cond{C3}, \cond{C3v2}) at $n{=}10$--$12$ each. The pipeline reuses the \texttt{cube\_lifting} library (no new extraction), making this a held-out \emph{transfer} test of the same skill set onto a structurally related but harder task (stacking requires lifting + placing). For LIBERO~\cite{libero2023} (Phase~2.5 and May~3 follow-ups, Section~\ref{sec:libero}) we used the same smoke-first policy: stop before full held-out transfer when the no-skills or privileged-smoke condition is already at floor.

\section{Experiments \& Results}
\label{sec:exp}

\subsection{From CaP-X baseline to a mined-skill testbed}
\label{sec:exp-groupd}

Table~\ref{tab:abcd} records the historical development conditions that motivated the controlled experiments. They are useful as provenance, but not as the paper's main evidence: Group A used different upstream conditions, Group D combined multiple changes, and the reported Group A ceiling did not reproduce. We therefore treat Groups A--D as a starting point for hypothesis generation and rely on the controlled conditions in Table~\ref{tab:ablation} for the claims.

\begin{table}[t]
\caption{Original Groups A--D on \texttt{cube\_lifting} ($n{=}50$). \emph{Group A's $98\%$ figure does not reproduce in our May~3 re-measurement (\cond{group\_a\_repro} at $n{=}30$ reaches only $63.3\%$, see Sec.~\ref{sec:exp-c3v2-n50}); the historical figure was a lucky sample plus drifted upstream conditions.}}
\label{tab:abcd}
\centering
\scriptsize
\setlength{\tabcolsep}{3pt}
\begin{tabular}{@{}p{0.23\columnwidth}rrp{0.34\columnwidth}@{}}
\toprule
Group & \# skills & Completed & Notes\\
\midrule
A (no skills) & 0 & 49/50 = 98\% & native API only (see caption) \\
B (manual 9) & 9 & $\sim$48/50 $\approx$ 96\% & hand-picked, partial logs \\
C (auto 16, buggy) & 16 & 8/50 = 16\% & broken namespace + weak gates \\
D (auto 11, fixed) & 11 & 49/50 = 98\% & ``3-fix'' applied \\
\bottomrule
\end{tabular}
\end{table}

The controlled comparison below isolates the skill-memory mechanism from harness repair: namespace seeding, quality gates, deduplication, empty stubs, and no-skills baselines are measured as separate conditions.

\subsection{How the mined library is used}
\label{sec:exp-mechanical}

We re-analysed the 50 Group D trials (89 retry directories with 227 code blocks) along nine zero-cost axes. Findings cluster into four themes.

\subsubsection{Skill usage matrix (Phase 1.1)}
Across 89 directories and 11 promoted skills, \texttt{execute\_grasp} is called in 100\% of directories ($170$ calls); the top four skills (\texttt{execute\_grasp}, \texttt{lift\_object}, \texttt{plan\_best\_grasp\_on\_mask}, \texttt{transform\_cam\_to\_world}) account for $566$ of $744 = 76\%$ of all calls. Two of the eleven promoted skills (\texttt{select\_top\_grasp}, \texttt{grasp\_pose\_to\_ik}) are \emph{dead}: zero calls anywhere (Figure~\ref{fig:matrix}). The distribution is top-heavy with a long tail; rare skills (\texttt{rotmat\_to\_quat\_wxyz}, \texttt{move\_to\_pose\_world}) appear mostly inside retry blocks.

\begin{figure}[t]
\centering
\includegraphics[width=\columnwidth]{fig-skill-usage-matrix.pdf}
\caption{Skill usage matrix (89 trial directories $\times$ 11 promoted skills) from Phase~1.1. Cell intensity is call count. Two skills are entirely unused.}
\label{fig:matrix}
\end{figure}

\subsubsection{Unpromoted pool audit (Phase 1.2)}
The library has 11 promoted skills but 330 \emph{unpromoted} ones in the candidate pool, for a total of 341 ($\sim$50 unique by hand inspection). Of the unpromoted set, 91.2\% fail the generality gate (\texttt{occurrences}=1), 76.1\% fail the success-rate gate, and 0\% fail the complexity gate. \textbf{18 unpromoted skills share an exact structural hash with a promoted skill}; the most common cluster collapses 9+ name variants into \texttt{transform\_cam\_to\_world} (Figure~\ref{fig:unpromoted}). Manual inspection of three samples confirmed they are 1-line matrix multiplications differing only in argument names and docstring wording.

\begin{figure}[t]
\centering
\includegraphics[width=\columnwidth]{fig-unpromoted-pool.pdf}
\caption{Bucketing of the 330 unpromoted candidates by gate-failure pattern (Phase~1.2). \emph{Genuine junk} is SR$<0.1$ and occ$=1$; \emph{near-miss} is SR$\geq 0.6$ + docstring + occ$=1$. 18 candidates are structural-hash duplicates of promoted skills, demonstrating the \emph{naming explosion}.}
\label{fig:unpromoted}
\end{figure}

\subsubsection{Survival features and gate sensitivity (Phase 1.4 + 1.8)}
The 16 pre-Group-D promoted skills all sit on the threshold (occurrences $=2$, source\_tasks $=2$, success rate $\in[0.33, 0.80]$, complexity $\leq 4$). A single feature does not separate kept from dropped. The five dropped skills all have signature-level twins among the kept set, suggesting that the dedup step (not the gates per se) caused the drop. A sweep over gate parameters shows that with the soft gates alone, $148/341$ candidates would be promoted; the actual count of $11$ tells us that hard-fail gates plus dedup remove a further $137$ of those $148$ soft-gate survivors -- \textbf{93\% of the post-soft-gate filtering is hard-fail + dedup}.

\subsubsection{Prompt position and docstring effect (Phase 1.5)}
The 11 promoted skills appear in a fixed order in the prompt (positions 7580--8394 of an 8875-character template). Pearson correlation between position and call count is $-0.10$ (Spearman $-0.01$): \emph{position by itself shows no primacy bias}. Stratifying by docstring presence: skills with a docstring ($n{=}8$) average $88.9$ calls; skills without a docstring ($n{=}3$) average $11.0$ calls -- a ratio of about $8\times$. Both dead skills have no docstring.

\subsubsection{Invocation context (Phase 1.6)}
Across all 744 promoted-skill calls, positional arity is correct in 100\% of cases. Variable names tracked the implied semantics (\texttt{grasp\_pose}, \texttt{world\_T}, \texttt{mask}). The single docstring-less but \emph{used} skill, \texttt{get\_grasp\_pose\_for\_mask}, was called with arity 2 in 100\% of its 33 occurrences -- evidence that a self-explanatory name and signature can substitute for a docstring on the input side, even though docstring-presence is a strong predictor of \emph{being} called.

\subsubsection{Trial-order effects (Phase 1.7)}
First 25 trials versus last 25: $12.8$ vs $17.0$ mean calls, $4.12$ vs $5.00$ mean blocks, $25/25$ vs $24/25$ successes. Pearson and Spearman correlations of trial-index against any metric are all $|r| < 0.23$. The LLM is stateless across trials, so a trial-order effect would have been a coincidence; we did not find one.

\subsubsection{Failure event sequence (Phase 1.3)}
The single Group D failure (\texttt{trial\_42}, reward $0.521$) had 12 retry blocks. Three of those blocks executed without code-level errors yet were rejected by the VDM, which set \texttt{decision = regenerate} but left the \texttt{reasoning} field empty. Across all 89 directories, 228 of 277 blocks ($82\%$) had empty VDM reasoning. The root cause is a combination of (a) the prompt asking for a single keyword response and (b) \texttt{gpt-4.1} via OpenRouter not populating the \texttt{reasoning} field (Phase~0.3). This is a separate bug from skill-library behaviour and is left to a future patch.

\subsubsection{Retry behaviour: abstraction downgrade (Phase 1.9)}
Of 76 retry directories, we classify each retry's skill set against the initial block: \cond{SAME\_SET} ($25$), \cond{REPLACE} ($21$), \cond{EXPAND} ($21$), \cond{SHRINK} ($9$). Success rates differ:

\begin{itemize}
\item \cond{REPLACE}: $14/21 = 66.7\%$
\item \cond{EXPAND}: $12/21 = 57.1\%$
\item \cond{SHRINK}: $4/9 = 44.4\%$
\item \cond{SAME\_SET}: $11/25 = 44.0\%$
\end{itemize}

The skills most often \emph{added} on retry are low-level coordinate primitives (\texttt{move\_to\_pose\_world}, \texttt{pose\_matrix\_to\_pos\_quat}, \texttt{transform\_cam\_to\_world}); the skills most often \emph{removed} are high-level pipelines (\texttt{plan\_best\_grasp\_on\_mask}, \texttt{execute\_grasp}, \texttt{lift\_object}). The pattern, summarised in Figure~\ref{fig:retry}, is an \textbf{abstraction downgrade}: when the high-level skill seems to be misbehaving, the LLM peels it back into its primitives -- much as a human programmer would.

\begin{figure}[t]
\centering
\includegraphics[width=\columnwidth]{fig-retry-abstraction-downgrade.pdf}
\caption{Phase~1.9 retry-classification success rates: replacing or expanding the skill set (\cond{REPLACE}/\cond{EXPAND}) outperforms keeping or shrinking it (\cond{SAME\_SET}/\cond{SHRINK}). The pattern is consistent with an abstraction-downgrade strategy.}
\label{fig:retry}
\end{figure}

\subsection{Does the mined library improve \texttt{cube\_lifting}?}
\label{sec:exp-ablation}

Table~\ref{tab:ablation} reports the headline ablation using descriptive arm labels. The broken-namespace diagnostic (\cond{C0}) reproduces the Group C bug mechanism at $5/15 = 33\%$. Turning namespace seeding on creates a high-performing no-dedup 16-skill arm (\cond{C1}). The gated no-dedup library (\cond{C2}) is the strongest powered skill condition. The original structural-dedup diagnostic (\cond{C3}) and the production structural-dedup library (\cond{C3v2}) show the survivor-selection tax. The no-skill baseline (\cond{P21\_a}), typed empty-stub control (\cond{empty\_ns}), and quality-ranked same-size library (\cond{manual\_11}) then anchor the claim ladder.

\begin{table}[t]
\caption{Main ablation table. Arm names are reader-facing labels; internal IDs in parentheses preserve reproducibility. The no-skill baseline, two no-dedup mined-skill libraries, production structural-dedup library, empty-stub control, quality-ranked same-size library, and backbone endpoint replications jointly define the v1 claim ladder. All CIs are 95\% Wilson intervals~\cite{wilson1927}.}
\label{tab:ablation}
\centering
\scriptsize
\setlength{\tabcolsep}{2.2pt}
\begin{tabular}{@{}p{0.34\columnwidth}rrll@{}}
\toprule
Experiment arm & $n$ & Success & Rate & 95\% CI \\
\midrule
Broken namespace (\cond{C0})      & 15 & 5  & $33.3\%$ & $[15.2\%, 58.3\%]$ \\
Namespace-enabled no-dedup (\cond{C1}) & 70 & 67 & $95.7\%$ & $[88.1\%, 98.5\%]$ \\
No-dedup gated skills (\cond{C2}) & 70 & 68 & $97.1\%$ & $[90.2\%, 99.2\%]$ \\
Original structural dedup (\cond{C3}) & 15 & 11 & $73.3\%$ & $[48.0\%, 89.1\%]$ \\
Production structural dedup (\cond{C3v2}) & 70 & 58 & $82.9\%$ & $[72.4\%, 89.9\%]$ \\
No-skill baseline (\cond{P21\_a}) & 50 & 39 & $78.0\%$ & $[64.8\%, 87.2\%]$ \\
\addlinespace
Typed empty stubs (\cond{empty\_ns}) & 30 & 13 & $43.3\%$ & $[27.4\%, 60.8\%]$ \\
Historical no-skill replay (\cond{group\_a\_repro}) & 30 & 19 & $63.3\%$ & $[45.5\%, 78.1\%]$ \\
Quality-ranked same-size (\cond{manual\_11}) & 70 & 66 & $94.3\%$ & $[86.2\%, 97.8\%]$ \\
Claude endpoint replication (\cond{C2}$\times$Claude) & 20 & 19 & $95.0\%$ & $[76.4\%, 99.1\%]$ \\
DeepSeek endpoint replication (\cond{C2}$\times$DeepSeek) & 20 & 20 & $100.0\%$ & $[83.9\%, 100.0\%]$ \\
\addlinespace
\multicolumn{5}{l}{\emph{v2 D2 closure (Claude / DeepSeek baselines + library, $n{=}50$)}} \\
\cond{P21\_a}$\times$Claude (no-skill, Claude) & 50 & 43 & $86.0\%$ & $[73.8\%, 93.0\%]$ \\
\cond{P21\_a}$\times$DeepSeek (no-skill, DeepSeek) & 50 & 3 & $6.0\%$ & $[2.1\%, 16.2\%]$ \\
\cond{manual\_11}$\times$Claude (library, Claude) & 50 & 49 & $98.0\%$ & $[89.5\%, 99.6\%]$ \\
\cond{manual\_11}$\times$DeepSeek (library, DeepSeek) & 50 & 47 & $94.0\%$ & $[83.8\%, 97.9\%]$ \\
\addlinespace
\multicolumn{5}{l}{\emph{v2 Dedup~v3 algorithmic robustness ($n{=}70$ each)}} \\
\cond{dedup\_v3\_k10} (top-10 by quality\_score) & 70 & 64 & $91.4\%$ & $[82.5\%, 96.0\%]$ \\
\cond{dedup\_v3\_k12} (top-12 by quality\_score) & 70 & 61 & $87.1\%$ & $[77.3\%, 93.1\%]$ \\
\cond{dedup\_v3\_k13} (top-13 by quality\_score) & 70 & 68 & $97.1\%$ & $[90.2\%, 99.2\%]$ \\
\bottomrule
\end{tabular}
\end{table}

\begin{figure}[t]
\centering
\includegraphics[width=\columnwidth]{fig-ablation-bar.pdf}
\caption{Final success rate by experiment arm with 95\% Wilson CIs. The no-skill baseline is measured at $n{=}50$; the powered no-dedup, production-dedup, and quality-ranked same-size libraries are measured at $n{=}70$; empty-stub and historical replay controls are measured at $n{=}30$.}
\label{fig:ablationbar}
\end{figure}

The 3-fix retrospective rewrite is the headline of this paper. Figure~\ref{fig:rewrite} shows the per-fix $\Delta$pp from the initial $n{=}15$ ablation, with the boosted-$n$ magnitudes in parentheses where they differ:

\begin{itemize}
\item \cond{C0}\,$\to$\,\cond{C1} (namespace seeding): $\boldsymbol{+67}$\textbf{pp} at $n{=}15$ (i.e.\ $33\%\to100\%$). Boosted to $n{=}70$, \cond{C1} settles at $95.7\%$, so the cleaner per-fix $\Delta$ is $\boldsymbol{+62.4}$\textbf{pp}. This remains the dominant fix in either reading.
\item \cond{C1}\,$\to$\,\cond{C2} (quality gates): $\boldsymbol{+0}$\textbf{pp} at $n{=}15$ ($100\%\to100\%$) -- gates trim the candidate pool but the visible difference at the promoted set is negligible ($95.7\%\to97.1\%$ at $n{=}70$, i.e.\ $+1.4$pp with heavily overlapping CIs).
\item \cond{C2}\,$\to$\,\cond{C3} (signature dedup v1): $\boldsymbol{-27}$\textbf{pp} at $n{=}15$ ($100\%\to73\%$). \cond{C3} was not boosted, so the magnitude carries small-$n$ uncertainty.
\item \cond{C3}\,$\to$\,\cond{C3v2} (Dedup v2, doc-prefer): $\boldsymbol{+20}$\textbf{pp} at $n{=}15$. Re-measured at $n{=}70$, the recovery is much smaller ($73\%\to82.9\%$, $\boldsymbol{+9.6}$\textbf{pp}), and \cond{C3v2} remains $\boldsymbol{-14.2}$\textbf{pp} below \cond{C2} at the same $n{=}70$ -- statistically separated (CI lower 90.2\% vs upper 89.9\%, narrow 0.3pp gap).
\end{itemize}

\begin{figure}[t]
\centering
\includegraphics[width=\columnwidth]{fig-3fix-narrative-rewrite.pdf}
\caption{Per-stage $\Delta$pp on \texttt{cube\_lifting} after the boosted measurements. The 3-fix story is rewritten: namespace seeding is the dominant single fix, structural dedup is a large tax, and a quality-ranked 11-skill subset recovers most of the no-dedup performance.}
\label{fig:rewrite}
\end{figure}

The initial counter-factual \cond{P21\_a} run (no skills) at $9/10=90\%$ sat between \cond{C2} and \cond{C3}. The boosted measurement later settled at $39/50=78.0\%$. That updated baseline is below the no-dedup library (\cond{C2}, $97.1\%$) but statistically indistinguishable from the dedup-applied production library (\cond{C3v2}, $82.9\%$), which is why the dedup investigation became central rather than cosmetic.

\subsection{Why survivor selection matters}
\label{sec:exp-dedupv2}

Dedup v2 swaps a single skill in the promoted set: \texttt{get\_grasp\_pose\_for\_mask} (no docstring, SR $0.55$) is removed and \texttt{plan\_and\_select\_grasp} (docstring, SR $0.53$) is added. At $n{=}15$, \cond{C3v2} reaches $14/15 = 93\%$, $+20$pp over \cond{C3}. The other dedup-clusters were unchanged.

The mechanism is consistent with Phase~1.5: docstring-bearing skills are called $\sim 8\times$ more often. The original dedup, by ignoring docstring presence in its survivor rule, removed Phase~1.5's strongest predictor from the promoted variant. Dedup~v2 makes the survivor rule \((\textsc{has\_docstring}, \textsc{success\_rate}, \textsc{name})\) and recovers the variant the LLM is more likely to actually call.

\subsection{Variance check at $n=50$ and $n=70$}
\label{sec:exp-c3v2-n50}

A 50-trial re-measurement of \cond{C3v2} produces $40/50 = 80\%$, 95\% Wilson CI $[67\%, 89\%]$ -- a $13$pp drop from the $n{=}15$ point estimate. Of the seeds that were successful at $n{=}15$, six (\{1, 7, 9, 12, 13, 15\}) failed at least once at $n{=}50$. Average regenerations rose from $0.13$ to $2.21$ and average code blocks from $0.87$ to $3.21$. The 95\% CI of \cond{C3v2} ($n{=}50$) overlaps heavily with that of \cond{C3} ($n{=}15$), so we revise the Dedup v2 effect down: \emph{$+5$ to $+10$pp, not $+20$pp}.

\begin{figure}[t]
\centering
\includegraphics[width=\columnwidth]{fig-c3v2-n50-wilson.pdf}
\caption{\cond{C3v2} success rate as the sample grew from $n=15$ to $n=50$ and $n=70$, with 95\% Wilson CIs. The $n=15$ estimate sits near the ceiling; the boosted estimates settle near $80$--$83\%$, far below the no-dedup \cond{C2} arm.}
\label{fig:c3v2n50}
\end{figure}

This is also the largest methodological caution we can issue against our own earlier ablation: \cond{C1} and \cond{C2} were first reported at $n{=}15$ as $100\%$, with CIs of $[80\%, 100\%]$, so their true means could plausibly have been anywhere in $80$--$95\%$; the $n{=}70$ re-measurements in Table~\ref{tab:ablation} place them at $95.7\%$ and $97.1\%$. The structure of the rewrite (namespace dominant, gates roughly neutral, dedup non-monotone) is robust, but the magnitudes within the dedup arm are noisier than the $n{=}15$ snapshot suggested.

\subsection{Where transfer fails: \texttt{cube\_stack}}
\label{sec:exp-heldout}

\texttt{cube\_stack} requires the robot to lift \emph{and} stack one cube on another. We ran four conditions ($n{=}10$--$12$ each) without re-extracting skills (Table~\ref{tab:cubestack}). Headline: every condition at $0$--$17\%$. \cond{C3v2} is the marginal best ($2/12 = 17\%$). \cond{C3} engages the task least (mean blocks $0.10$, regenerations $0.00$): on most trials the LLM emits no real code and immediately returns \texttt{FINISH}.

\begin{table}[t]
\caption{Held-out transfer to \texttt{cube\_stack}. ``Engage'' is mean code blocks $\times$ mean regens, a rough proxy for how hard the LLM tried.}
\label{tab:cubestack}
\centering
\begin{tabular}{lrrrr}
\toprule
Cond & $n$ & Success & Avg reward & Engage \\
\midrule
\cond{p22\_a} (no skills) & 10 & 0/10 & 0.062 & 3.91 \\
\cond{C2} (14 skills)     & 10 & 0/10 & 0.287 & 1.80 \\
\cond{C3} (11 skills)     & 12 & 0/12 & 0.069 & 0.00 \\
\cond{C3v2} (11, dedup v2) & 12 & 2/12 & 0.249 & 0.00 \\
\bottomrule
\end{tabular}
\end{table}

\begin{figure}[t]
\centering
\includegraphics[width=\columnwidth]{fig-cubestack-failure-modes.pdf}
\caption{Cube\_stack failure-mode breakdown by condition (Phase~2.4d). Most failures are not in the skill layer: \texttt{scipy.Rotation.from\_dcm} (deprecated API in LLM-generated code), SAM3 422 errors (numpy int64 not JSON serialisable), ``green cube not found'' (SAM3 weakness on green colour), and 1000-second trial timeouts.}
\label{fig:cubestack}
\end{figure}

A debug pass (Phase~2.4d) explains the floor (Figure~\ref{fig:cubestack}). Across the four conditions we identified four dominant failure modes: (i) the LLM emits \texttt{scipy.Rotation.from\_dcm}, which was deprecated in SciPy~1.4 and removed in SciPy~1.6; (ii) SAM3 \texttt{422 Client Error} from numpy \texttt{int64} coordinates that are not JSON-serialisable; (iii) ``green cube not found'' (SAM3 text-prompt weakness on the green block); (iv) 1000-second trial timeouts. None of these are skill-library bugs; they are base-API or LLM-side bugs that floor every condition. \cond{C3v2}'s 2/12 successes show the pipeline \emph{can} stack, so the floor is not absolute -- it is a measurement noise zone in which any skill-library effect we might want to detect is below the SAM3-and-scipy noise.

\subsection{Where transfer fails: LIBERO}
\label{sec:libero}

We attempted a second held-out transfer test on LIBERO Spatial Task~0 (``pick up the black bowl between the plate and the ramekin'') using the same reduced base API as \texttt{cube\_lifting} (identical \texttt{solve\_ik}, \texttt{move\_to\_joints}, \texttt{segment\_sam3\_*}, \texttt{point\_prompt\_molmo}, \texttt{plan\_grasp}) wrapped via a new \texttt{FrankaLiberoApiReducedAutoSkills} adapter that mirrors the cube\_lifting auto-skills wrapper. Three conditions were designed: a 5-trial smoke (\cond{smoke\_a}, no skills), a 10-trial held-out baseline (\cond{heldout\_a}, no skills), and a 10-trial skill-injected condition using the Dedup~v2 11-skill library (\cond{heldout\_c3v2}). The smoke phase ran on an A6000 pod (\$0.49/hr) and produced $5/5$ failures (reward $0.000$, \texttt{taskcompleted=False}) with very short trial code -- average $1.8$ blocks per trial and $0.8$ regenerations -- suggesting the LLM disengaged early once it encountered the small-bowl-on-plate perception challenge. This mirrors the \cond{C3} engagement collapse on \texttt{cube\_stack} (Phase~2.4d, average $0.10$ blocks per trial).

The two held-out conditions \emph{never executed}. A bash interaction in the runner script (a \texttt{set -e} upstream of an \texttt{ls~|~grep~-c~"taskcompleted\_1"~||~echo~0} check that returned exit code $1$ when no match was found) terminated the runner after the smoke phase, before \cond{heldout\_a} or \cond{heldout\_c3v2} could start. The pod was then auto-terminated by the wrap-up timer at $11$ minutes total wall-clock; total cost was $\sim$\$$1.5$ (pod $\sim$\$$0.09$, LLM $\sim$\$$1.4$). We treat this as a \emph{negative methodological observation, not a result}: we did not collect skill-injected data on LIBERO. The interpretation we are entitled to is narrow -- LIBERO Spatial 0 baseline is at floor with our current pipeline (\texttt{gpt-4.1} + reduced API + Molmo + SAM3); we did not measure skill-library transfer.

A follow-up smoke (April 28, robust runner) tested two LIBERO subtasks expected to be simpler: \texttt{libero\_spatial} task 2 (``pick up the black bowl from the table center and place it on the plate'' -- a single-bowl variant with no relational spatial reasoning) and \texttt{libero\_object} task 0 (a different LIBERO suite). Each $n{=}5$, no-skills baseline. Both produced $0/5$ with average reward $0.000$, identical to the original Spatial 0 floor. This suggests the floor is not driven by Spatial-0-specific difficulty but by a broader incompatibility between our SAM3+GraspNet perception pipeline and LIBERO's scene/object distribution. Combined with the \texttt{cube\_stack} floor (Section~\ref{sec:exp-heldout}), this means our skill library's transfer effect is currently unmeasurable at the cube\_lifting $\to$ other-task boundary, and the bottleneck is base-pipeline robustness rather than task selection.

\subsection{Docstrings and task fit: adversarial decoys}
\label{sec:phase33}

To probe Phase~1.5's hypothesis that ``docstrings predict invocation'' as a \emph{causal} statement, we ran a small adversarial test (April 28). We hand-wrote three plausibly-named no-op skills, gave each a complete one-line docstring, faked their metadata to pass the gates (\texttt{occurrences}=2, \texttt{success\_rate}=0.85), and inserted them into the \cond{C3v2} promoted set so the LLM saw 14 promoted skills, three of which were decoys: \texttt{optimize\_grasp\_with\_priors(obs, mask)} returns the identity pose, \texttt{verify\_workspace\_safety(obs, target\_pose)} returns \texttt{True}, and \texttt{compute\_optimal\_lift\_height(obs, mask)} returns the constant 0.15. We then ran cube\_lifting at $n{=}15$ ($\sim$\$$2.5$).

The success rate was $14/15 = 93\%$, indistinguishable from \cond{C3v2}'s baseline at the same $n$. Aggregate decoy-call counts across all trials and retries: \texttt{optimize\_grasp\_with\_priors} = $0$, \texttt{verify\_workspace\_safety} = $8$ (all from a single trial), \texttt{compute\_optimal\_lift\_height} = $0$. The trial-level invocation rate -- ``did the LLM call any decoy at any point in this trial?'' -- is $1/15 = 6.7\%$. The single trial that fell for a decoy used the verb-style \texttt{verify\_*} as a defensive sanity check immediately before each grasp call (twice per grasp, with \texttt{(pregrasp, quat)} and \texttt{(grasp, quat)} as arguments); it succeeded anyway because \texttt{verify\_workspace\_safety} returned \texttt{True} regardless and the actual grasp was still planned by the real \texttt{plan\_and\_select\_grasp} skill. The two decoys whose names suggest \emph{producers} of values (\texttt{optimize\_grasp\_with\_priors} returning a pose, \texttt{compute\_optimal\_lift\_height} returning a float) were not called once in 15 trials.

Two interpretations follow. (i) Plausible name + complete docstring is necessary but not sufficient for invocation. The LLM picks among the 14 promoted skills primarily by task fit, not by surface form -- it ignores producer-style decoys whose outputs would have to be plumbed into downstream calls (and would be silently wrong if used). (ii) The verb-style \texttt{verify\_*} pattern carries a small bias because it is a natural sanity-check call site that does not need to integrate with the rest of the program. We caution against over-reading the $6.7\%$ from $n{=}15$: the Wilson 95\% CI is $[1.2\%, 29.8\%]$. The structural pattern -- producer-style decoys ignored, single verifier-style decoy called only as a wrapper -- is the part we trust.

This refines the dedup-as-survivor framing of Section~\ref{sec:exp-dedupv2}: the docstring signal is the right tiebreaker among functionally equivalent variants because the LLM does prefer documented variants \emph{when both are otherwise plausible}; it is not, however, a strong enough hook to lure the LLM into calling a function it does not need.

\subsection{Docstrings and task fit: forced-choice variants}
\label{sec:phase33-doc}

A complementary test (April 29) targeted the converse question: when the LLM \emph{does} have to pick a skill, does docstring presence drive the pick? We replaced \texttt{lift\_object} in the \cond{C3v2} promoted set with two functionally identical variants -- \texttt{lift\_object\_documented} and \texttt{lift\_object\_undocumented} -- with byte-identical bodies and the only surface difference being a one-line docstring on the former. Other ten skills unchanged; total 12 promoted; $n{=}15$, cube\_lifting.

Result: $15/15 = 100\%$ task success (as expected -- both variants compute the same lift). \emph{Of $20$ total lift-skill calls across all trials and retries, $20$ went to the documented variant and $0$ went to the undocumented variant} (call-level units; at the trial level, $13/15$ trials invoked the documented variant, $0/15$ invoked the undocumented variant, and the remaining two trials used \texttt{execute\_grasp} -- a higher-level skill that already incorporates lift -- and never called either variant). The Wilson 95\% confidence interval on the documented-variant share of calls is approximately $[83\%, 100\%]$.

This 20:0 split is consistent with a causal reading of Phase~1.5's $8\times$ docstring--invocation correlation, but one confound was not isolated by this design alone: the variant names themselves -- \texttt{lift\_object\_documented} and \texttt{lift\_object\_undocumented} -- carry semantic content. The token \emph{undocumented} is itself a self-label that may signal ``do not use,'' so the on/off test by itself conflates docstring presence with name-string semantics.

To isolate the docstring effect from the name-string semantics, we ran a counter-balanced replication on April 30. The same byte-identical lift body was registered under neutral letter-only names \texttt{lift\_object\_a} and \texttt{lift\_object\_b}; in condition \cond{v1} the docstring sat on \texttt{lift\_object\_a}, in condition \cond{v2} it sat on \texttt{lift\_object\_b}; everything else was held constant. Each condition had $n{=}15$. The result was identical in both: \cond{v1} produced $42:0$ in favour of \texttt{lift\_object\_a}, \cond{v2} produced $47:0$ in favour of \texttt{lift\_object\_b}. Pooled across the 30 trials, the docstring-bearing variant captured $89/89 = 100\%$ of lift-skill calls. Letter neutrality also held: total \texttt{a}-calls (42) and total \texttt{b}-calls (47) were essentially balanced ($47\%{:}53\%$). Because the only stimulus that flipped sides between v1 and v2 is the docstring, and the response (which variant the LLM calls) flipped with it perfectly, name-string semantics and letter ordering are both ruled out as drivers.

A multi-backbone replication of the same neutral-name v1 setup on May 1 confirms the effect is not gpt-4.1-specific. With $n{=}15$ each, Claude Sonnet~4 reached $49:0 = 100\%$ doc-share and DeepSeek v3 reached $109:6 = 94.8\%$ doc-share (the slightly lower share on DeepSeek reflects a higher overall call volume rather than a competing variant getting traction; structurally the doc-share remains decisive). All three backbones cleared a $90\%$ doc-share threshold. We therefore treat the docstring-as-tiebreaker effect as an LLM-general mechanism rather than a single-backbone artefact.

Combined with the decoy result, the two-step picture is: \emph{docstrings are causally responsible for invocation among task-fit-equivalent variants, but they are not a sufficient hook to manufacture invocation in the absence of task fit}. Operationally, this is what Dedup v2's \((\textsc{has\_docstring}, \textsc{success\_rate}, \textsc{name})\) survivor rule encodes. The same docstring-causality reading retroactively explains a small mystery from Phase~1.1: the only two ``dead'' promoted skills (\texttt{select\_top\_grasp} and \texttt{grasp\_pose\_to\_ik}, $0/0$ calls) are also the only two skills in the C3v2 promoted set with empty docstrings. The cost of the doc test was \$$0.16$ pod plus $\sim$\$$4$ LLM, run autonomously by a sub-agent including pod creation, experiment, R2 backup, and pod termination.

\subsection{Multi-backbone library effect: D2 closure}
\label{sec:d2-closure}

The v1 cross-backbone story was incomplete: paper~v1 measured the \cond{C2} endpoint on Claude Sonnet~4 ($95.0\%$, $n{=}20$) and DeepSeek v3 ($100.0\%$, $n{=}20$), but did not re-run the no-skills baseline on either backbone, so the library-vs-baseline gap was unmeasured outside \texttt{gpt-4.1}. We close this in v2 with $n{=}50$ measurements of \cond{P21\_a} and \cond{manual\_11} on both backbones (May~4 phase 1+2 real run, $\sim$\$$60$, A40 pod, sequential serial runner). We use \cond{manual\_11} as the library arm because it is the production-recommended quality-ranked subset of \cond{C2} from paper v1.

The result is a strong asymmetry. On Claude Sonnet~4, \cond{P21\_a} reaches $43/50 = 86.0\%$ (CI $[73.8\%, 93.0\%]$) and \cond{manual\_11} reaches $49/50 = 98.0\%$ (CI $[89.5\%, 99.6\%]$): the library effect is $+12$pp. The CIs overlap by $3.5$pp at the boundary, but the one-sided binomial test $P[X{\ge}49\mid n{=}50, p{=}0.86]\approx 0.5\%$ provides strong evidence that the library improves Claude beyond the no-skills baseline. On DeepSeek v3, \cond{P21\_a} reaches only $3/50 = 6.0\%$ (CI $[2.1\%, 16.2\%]$) while \cond{manual\_11} reaches $47/50 = 94.0\%$ (CI $[83.8\%, 97.9\%]$): the library effect is $+88$pp, with CIs fully separated and a one-sided binomial $P\approx 10^{-54}$. Combined with the original \texttt{gpt-4.1} measurement ($+19$pp at $n{=}50/70$), the three-backbone library effects span $+12$pp / $+19$pp / $+88$pp.

Two readings follow. (i) Library magnitude is a function of baseline competence, not a uniform LLM-general gain. The weakest backbone in this specific code-generation regime (DeepSeek v3 at $6\%$ baseline) gains the most from a \emph{good} skill library; the middle backbone (\texttt{gpt-4.1} at $78\%$) gains a moderate amount; the strongest baseline (Claude Sonnet~4 at $86\%$) gains the least. (ii) The DeepSeek result is the largest single-task library benefit observed in this project, and it makes the cross-backbone library claim quantitatively stronger than the paper~v1 endpoint replication: skill libraries help \emph{most} where the underlying model is least competent at the task. The Claude $+12$pp gain is smaller in magnitude but still statistically detectable at $n{=}50$, indicating that the library benefit is non-trivial even where the no-skills baseline is already strong; the apparent ``ceiling'' in paper~v1's endpoint replication ($95\%/100\%$ at $n{=}20$) was therefore not a hard upper bound. We do not extrapolate this gradient to other tasks: it may invert on tasks where Claude is the weak backbone. The mechanism question -- why DeepSeek floors at $6\%$ on cube\_lifting under no-skills, and what specific skill calls bridge it to $94\%$ -- is left as Section~\ref{sec:limitations} follow-up.

\subsection{Dedup~v3 algorithmic robustness}
\label{sec:dedup-v3}

The \cond{manual\_11} arm (paper v1, $n{=}70$, $94.3\%$) demonstrated that a quality-ranked $11$-skill subset of \cond{C2} recovers the performance lost by structural-hash dedup, but the $n{=}70$ measurement was a single point: $k{=}11$ specifically. We test in v2 whether the recovery generalises across nearby cutoffs. The recipe is identical to \cond{manual\_11}: take the $14$ \cond{C2}-promoted skills, rank by \texttt{quality\_score} descending (ties broken alphabetically), keep the top $k$. We measure $k\in\{10, 12, 13\}$ at $n{=}70$ each on \texttt{gpt-4.1}.

Results: $k{=}10$ reaches $64/70 = 91.4\%$ (CI $[82.5\%, 96.0\%]$); $k{=}12$ reaches $61/70 = 87.1\%$ (CI $[77.3\%, 93.1\%]$); $k{=}13$ reaches $68/70 = 97.1\%$ (CI $[90.2\%, 99.2\%]$). Combined with $k{=}11$ from \cond{manual\_11} ($94.3\%$, CI $[86.2\%, 97.8\%]$), the four-cutoff curve is $91.4 / 94.3 / 87.1 / 97.1\%$. We test each against the production structural-dedup library \cond{C3v2} ($58/70 = 82.9\%$, CI $[72.4\%, 89.9\%]$) using one-sided binomial tests: $k{=}11$ ($P[X{\ge}66\mid n{=}70, p{=}0.829]{\approx}4\times10^{-3}$, CI separated) and $k{=}13$ ($P{=}2\times10^{-4}$, CI separated) are statistically beyond \cond{C3v2}; $k{=}10$ is marginally above ($P{=}0.034$, CI overlaps slightly $82.5$ vs $89.9$); and $k{=}12$ is not separated ($P{=}0.22$, CIs fully overlap).

Two interpretations follow. (i) The \cond{manual\_11} recovery is not a $k{=}11$-specific accident -- it survives moving to $k{=}13$ (still $97.1\%$) -- but the recipe is not uniformly $k$-robust either: $k{=}12$ falls back to a level that is statistically indistinguishable from \cond{C3v2}. We therefore label the rule \emph{Dedup~v3} as ``top-$k$-by-quality\_score with $k\in\{11, 13\}$ giving the strongest separation'' rather than as a uniformly $k$-robust function. The recipe is still implementable as a deterministic post-promotion pass: drop the bottom-($14{-}k$) skills by \texttt{quality\_score} (ties: docstring presence, then alphabetical), but $k{=}12$ should be avoided. (ii) The $k{=}12$ dip ($87.1\%$ vs $91.4\%$/$94.3\%$/$97.1\%$ at $k{=}10/11/13$) makes the curve non-monotone in $k$, and its mechanism remains open. \emph{An earlier draft of this paper attributed the dip to docstring-less \texttt{get\_grasp\_pose\_for\_mask} ($q{=}0.864$) being kept in the $k{=}12$ namespace; that attribution was empirically self-refuted before publication.} The actual marginal change at $k{=}11{\to}12$ is the \emph{addition} of \texttt{pixel\_mask\_to\_world\_points} ($q{=}0.878$, \emph{has} docstring), and \texttt{get\_grasp\_pose\_for\_mask} only enters the namespace at $k{=}13$ (where it correlates with the recovery, not the dip). The simpler ``docstring-as-tiebreaker'' mechanism (paper~v1 \S VI.C) does not cleanly explain $k{=}12$: \cond{manual\_11} ($k{=}11$) also keeps the docstring-less \texttt{select\_top\_grasp} ($q{=}0.900$) without a corresponding dip. We therefore record the $k{=}12$ result as a non-monotone empirical finding and leave its mechanism (e.g., function-semantic conflict from \texttt{pixel\_mask\_to\_world\_points}, namespace-size sensitivity, or sampling variance) as future work. The micro-test originally planned for this hypothesis (swap-only $n{=}30$ on the targeted skill) was cancelled as the targeting was based on the refuted attribution.

The practical recommendation is therefore narrower than first glance suggested: ship a Dedup~v3 stage that ranks survivors by \texttt{quality\_score} (with docstring tie-breaks) rather than by structural signature, but choose $k$ near $11$ or $13$ rather than $12$. Paper~v1's claim that ``a smarter dedup is possible'' is now an algorithmic recipe with a measured non-monotone $k$-sensitivity, not a uniformly robust function.

\subsection{Smoke evaluations: cross-task, multi-session, mechanism}
\label{sec:smoke-phase2}

Three pre-registered ``smoke'' micro-evaluations probe the boundaries of paper~v2's claims (full pre-registration in \texttt{docs/superpowers/specs/2026-05-06-capx-closing-design.md}, observation logs in \texttt{docs/superpowers/observations/2026-05-08-smoke\{1,2,3\}-*.md}). They are deliberately small ($n{=}3$--$30$) and are reported as boundaries on the v2 claims, not as new positive results.

\subsubsection{Cross-task transfer: \texttt{cube\_stack\_3}}

We add a new task \texttt{cube\_stack\_3} (red-on-green-on-blue stacking, 3 cubes, sequential lift-and-place) and measure both the no-skills baseline (\cond{P21\_a}) and the production library arm (\cond{manual\_11}) at $n{=}15$ each on \texttt{gpt-4.1}. Both arms reach $0/15$ \emph{Task completed}: pre-registered ``Fail (floor)'' outcome (baseline ${\le}20\%$, spec~\S 3.2a). Reward distribution at the sandbox-attempt level is informative: \cond{P21\_a} achieves a single stacking on $6/25$ attempts, \cond{manual\_11} on $4/21$; the second stacking (full task) is reached on neither.

Inspection of trial logs identifies the proximate cause as a \emph{vision pipeline} saturation rather than LLM logic. All three cubes are physically present in the simulator (verified by direct \texttt{sim.data.xpos} reads), but \texttt{sam3} segmentation produces highly fragmented per-cube masks (${>}200$ small mask entries with many zero-pixel false positives) when the scene contains multiple chromatically distinct primitives. The downstream \texttt{pixel\_mask\_to\_world\_points} and \texttt{get\_grasp\_pose\_for\_mask} functions require a single contiguous mask per cube, which they do not get. \texttt{cube\_lifting} (single red cube) trivially passes mask consolidation; \texttt{cube\_stack\_3} does not. This refines the v1/v2 transfer story: paper~v1's \texttt{cube\_stack} floor and the present \texttt{cube\_stack\_3} floor are not the same as the LIBERO floor (Section~\ref{sec:libero}), which persists even with privileged-API state. The \texttt{cube\_stack*} family is \emph{vision-pipeline-bound}; LIBERO is \emph{not perception-only-bound}.

A secondary finding is that the library produces \emph{code-efficiency} gains even when both arms floor on task completion. \cond{manual\_11} reduces average code blocks per trial from $5.73$ to $4.13$ ($-27.9\%$), regenerations from $4.73$ to $3.13$ ($-33.8\%$), and wall time from $2338$\,s to $1777$\,s ($-24\%$). The library makes the LLM converge faster on a partial solution; this is a \emph{decoupling} of ``library helps task-success-rate'' from ``library helps code-generation cost'' that the headline numbers in Sections~\ref{sec:exp-ablation} and~\ref{sec:d2-closure} cannot establish (those sections measure only task completion, where the library effect mixes both gains).

\subsubsection{Multi-session refinement: namespace saturation}

We attempt round-2 mining (mine new skills from round-1's fail trials, add to round-1's promoted set) on a mid-range round-1 baseline. The pre-registered round-1 (\cond{C3v2}, $n{=}70$, $82.9\%$) is unavailable because raw trial outputs for that condition were not retained in cold storage; we substitute \cond{dedup\_v3\_k12} (paper v2 $87.1\%$, $n{=}70$) as the round-1, since it is also mid-range and its outputs are preserved. The substitution is documented and discussed in Section~\ref{sec:limitations}.

The mining stage produces \emph{zero promotable new skills}. AST extraction over the $3$ available fail-trial \texttt{code.py} files finds no top-level function definitions: when handed a dense library covering the entire pick-and-place workflow, the LLM composes it imperatively (sequences of API calls) rather than wrapping skills in higher-order helpers. The round-2 promoted set is identical to round-1; we did not run a round-2 trial sweep, since it would be a replication of round-1.

This is a \emph{stronger negative} than the spec's pre-registered Pass/Marginal/Fail structure anticipated. We interpret it as evidence of a \emph{namespace saturation} regime: paper~v2's auto-mining loop (mine $\to$ promote $\to$ add) implicitly assumes trial code contains novel functions. As library density grows, this assumption fails monotonically; \texttt{cube\_lifting} under \texttt{gpt-4.1} with $k{=}12$ already sits at or past the saturation point. Further \texttt{mine-and-add} iterations will not grow the library from this baseline. The correct response is either (i)~switch the mining target to a harder task (where the library no longer covers the surface, restoring the abstraction-finding signal -- cf.\ the \texttt{cube\_stack\_3} floor above, which the present pipeline cannot reach but which \emph{should} produce mineable code if perception were fixed), or (ii)~replace \texttt{def}-extraction with semantic-similarity clustering of API-call sequences. We leave both as paper-v3 candidates.

\subsubsection{Backbone mechanism: partial-grip-and-drop versus robust completion}

We re-run \cond{P21\_a} on cube\_lifting with $n{=}3$ each on DeepSeek v3 and Claude Sonnet~4 and inspect the per-attempt reward distribution. Outcomes match paper~v2's Section~\ref{sec:d2-closure} numbers within sampling variance (DeepSeek $0/3$, $P{=}0.83$ given paper rate $6\%$; Claude $3/3$, $P{=}0.64$ given paper rate $86\%$). The mechanism signal is in the \emph{reward profile}, not the binary task-completion count.

Cube\_lifting's reward shaping rewards partial grip + lift attempts: rewards $0.4$--$0.6$ correspond to ``approached + partial grip + dropped during lift''; $0.7$--$0.9$ to ``successful grip + partial lift''; $1.0$ to ``cube held above task-complete height.'' DeepSeek's $5$ failing sandbox attempts span $0.526$ / $0.527$ / $0.534$ / $0.701$ / $0.707$: gripping logic is sound, lift trajectories are unstable. Claude's $5$ failing attempts span $0.478$--$0.546$, the same partial-grip-and-drop class, but its $3$ successful attempts (one per trial) reach $1.000$ on the second or third regeneration. Both backbones make first-attempt errors of the same class; their divergence is in the regeneration loop.

Read together with paper~v2's Section~\ref{sec:d2-closure}, this suggests a sharper interpretation of the multi-backbone library asymmetry. DeepSeek's $6\%$ baseline is not a reasoning failure (it does grip and partly lift); it is a self-correction failure (its regenerations cycle through similar partial solutions without converging). The library's $+88$pp benefit on DeepSeek is plausibly large because the library replaces DeepSeek's broken regeneration loop with a fixed, correct skill -- bypassing the iteration mechanism entirely rather than augmenting it. Claude's smaller $+12$pp benefit is consistent with its already-effective $1$--$2$ regeneration convergence: the library accelerates this but is partly redundant. We register this as a falsifiable mechanism story (Section~\ref{sec:limitations}, paper-v3 follow-up): if true, then improving DeepSeek's iteration (e.g., reflection-style regeneration prompting) should reduce the library effect on DeepSeek.

\section{Discussion}
\label{sec:discussion}

\subsection{Claim ladder for v1}

The central evidence supports a narrow positive result and several important negative boundaries. The positive result is that no-dedup mined-skill libraries improve \texttt{cube\_lifting}: the namespace-enabled no-dedup arm (\cond{C1}) and gated no-dedup arm (\cond{C2}) remain near $96$--$97\%$ at $n{=}70$, while the powered no-skill baseline (\cond{P21\_a}) is $78.0\%$. The typed empty-stub control (\cond{empty\_ns}) makes this a content claim rather than a namespace claim: typed function surfaces without bodies fall to $43.3\%$.

The boundary is survivor selection. The production structural-dedup library (\cond{C3v2}) reaches only $82.9\%$ and is not clearly above baseline. In contrast, the quality-ranked same-size library (\cond{manual\_11}) keeps the same library size but changes the survivors and reaches $94.3\%$. This pattern aligns with the library-use audits: dead skills lack docstrings, functionally equivalent variants compete on documentation and task fit, and structural hashes can merge variants that are not equivalent from the LLM's invocation perspective. A better Dedup~v3 should therefore rank survivors by execution quality, docstring/task-fit cues, and downstream call evidence rather than by structural signature alone.

\subsection{Why the original recovery story is insufficient}

The historical Group C $\to$ Group D jump is useful only as a hypothesis generator. It combined namespace seeding, gates, and dedup, so it could not identify which mechanism helped. The controlled ablation shows a more conventional causal picture: namespace seeding repairs a real runtime bug; quality gates matter upstream by pruning the candidate pool; and structural dedup is non-monotone because it chooses which variant remains visible to the LLM. The methodological lesson is simple: a single high-performing run after multiple harness changes should not be used as causal evidence without isolated controls.

\subsection{Ceiling, floor, and the next measurement regime}

\texttt{cube\_lifting} now has enough headroom to measure the no-dedup skill win, but it is still only one task. \texttt{cube\_stack} and LIBERO do not yet provide clean transfer measurements because both sit at or near floor under the present harness. The next decisive experiment is not merely ``more samples''; it is a medium-difficulty transfer task where the no-skills baseline lies roughly between $50$ and $80\%$. In that regime, a Dedup~v3 library can be tested for a $10$pp-scale gain without ceiling or floor masking the effect.

\section{Limitations}
\label{sec:limitations}

\textbf{Single skill-extraction task.} All 11 promoted skills were mined from \texttt{cube\_lifting} trials. Generalisation tests (cube\_stack, LIBERO) are transfer-only; we do not re-mine on the new task. A study of cumulative skill mining across many tasks (Q3) is future work.

\textbf{Sample sizes.} \cond{P21\_a} is at $n{=}50$, \cond{C1}/\cond{C2}/\cond{C3v2} at $n{=}70$, \cond{empty\_ns} and \cond{group\_a\_repro} at $n{=}30$. \cond{C0} and \cond{C3} remain at $n{=}15$ from the original 4-condition ablation; their magnitudes within the dedup arm should be read with care. The Phase~G boost confirmed the May~1 \cond{C1}/\cond{C2} $96\%$ was real, not a lucky sample.

\textbf{Stochasticity in \texttt{gpt-4.1}.} OpenRouter does not pin a snapshot for \texttt{gpt-4.1}, so model identity drifts on the scale of weeks. The empty-VDM-reasoning artifact (Phase~0.3) is partly a consequence: \texttt{gpt-4.1} via OpenRouter does not fill the \texttt{reasoning} field, leaving the VDM response to the prompt format alone. We did not control for snapshot.

\textbf{Cross-backbone baseline delta (resolved in v2).} Paper~v1 replicated the \cond{C2} endpoint on Claude Sonnet~4 and DeepSeek~v3 but did not re-run the no-skills baseline. The v2 D2 closure (Section~\ref{sec:d2-closure}) measures both baselines and the \cond{manual\_11} library at $n{=}50$ each, yielding library effects of $+12$pp on Claude (one-sided $P{=}0.5\%$, CIs overlap by $3.5$pp at boundary) and $+88$pp on DeepSeek (CIs fully separated, $P\approx 10^{-54}$). What remains open: the \emph{mechanism} of DeepSeek's $6\%$ baseline floor on cube\_lifting (perception parsing? code generation pattern? prompt format compliance?) and the cleaner separation of the Claude $+12$pp gain (an $n{=}100$ replication would resolve the boundary CI overlap). These are the natural follow-ups for paper~v3.

\textbf{VDM weaknesses.} On \texttt{cube\_stack} the SAM3 segmentation step fails on ``green cube'' more often than on red cube, biasing the floor. Our pipeline does not attempt to detect or compensate for this.

\textbf{No long-horizon evolution data.} Q3 (``how does the library evolve?'') is currently unanswerable from a single 50-trial extraction cycle. Phase~3 (long-horizon, $n{=}100$--$200$, library snapshots every $N$ trials) is on the roadmap but has not been run.

\textbf{LIBERO incompatibility is not a perception problem alone.} The Phase~2.5 LIBERO experiment ran only the smoke condition ($n{=}5$, all $0$); two simpler subtasks (\texttt{libero\_spatial} task 2, \texttt{libero\_object} task 0) at $n{=}5$ each also produced $0$. A privileged-API smoke on May~3 (\cond{libero\_priv}, $n{=}10$, \texttt{FrankaLiberoPrivilegedApi} with ground-truth poses bypassing perception) reaches only $1/10 = 10\%$ (CI $[1.8\%, 40.4\%]$), still effectively at floor. So the LIBERO floor is not perception-only -- it persists when perception is replaced by privileged state -- and the deeper cause likely involves the control API, the prompt format, or task semantics specific to the LIBERO simulator. Skill-library transfer beyond cube\_lifting therefore remains unmeasured, and the path to measuring it requires environment work rather than just additional sample size.

\textbf{Phase~3.3 sample sizes.} The decoy and on/off subsections use $n{=}15$ each; the neutral-name doc replication is $n{=}15$ per side ($n{=}30$ pooled), with multi-backbone follow-ups at $n{=}15$ per backbone. The decoy invocation rate of $1/15 = 6.7\%$ has a wide Wilson 95\% CI of $[1.2\%, 29.8\%]$; the structural observation (producer-style decoys ignored, verifier-style called once as a wrapper) is the part we trust. The pooled $89:0$ doc split (counter-balanced gpt-4.1) and the three-backbone replication ($100\%/100\%/94.8\%$) are jointly decisive at $\alpha < 10^{-4}$. Generalisation to other tasks and to other docstring qualities (short / long / misleading) is not yet measured.

\textbf{Vision-pipeline ceiling on \texttt{cube\_lifting}.} The new \texttt{cube\_stack\_3} floor (Section~\ref{sec:smoke-phase2}) localises the bottleneck to \texttt{sam3} mask consolidation, not LLM reasoning or library content. By implication, \texttt{cube\_lifting}'s $90\%{+}$ rates may partly reflect a perception-easy regime (single chromatically distinct cube on neutral background) rather than genuine capability. Future work should test whether per-cube SAM prompts with class-conditioned anchors, contact-graspnet's instance-aware pose estimation, or an alternative segmentation backbone (e.g., grounded-segment-anything with explicit class labels) restore measurable gradients on multi-object tasks.

\textbf{Multi-session mining hits saturation.} The round-2 attempt (Section~\ref{sec:smoke-phase2}) returned $0$ promotable new skills from a mid-range library's fail-trial code, because dense libraries induce purely imperative LLM code with no top-level function defs. Paper~v2's auto-mining loop has a measured saturation regime; further \texttt{mine-and-add} iterations on \texttt{cube\_lifting} under \texttt{gpt-4.1} from $k{=}12$ will not grow the namespace. Two paper-v3 candidates: (i)~run round-2 mining on a harder task (where library coverage no longer saturates the surface), and (ii)~replace \texttt{def}-extraction with semantic-similarity clustering of API-call sequences.

\textbf{Backup-protocol gap on \cond{C3v2} raw outputs.} The pre-registered round-2 baseline (\cond{C3v2}, $n{=}70$, $82.9\%$) was substituted by \cond{dedup\_v3\_k12} ($n{=}70$, $87.1\%$) because \cond{C3v2}'s raw trial outputs were not retained in cold storage at any phase backup, although the skill-set JSON was preserved. The substitution preserves the scientific question (mid-range round-1 baseline, same backbone, same task) but is a strict deviation from the spec's exact pre-registration. The closing retrospective records the underlying gap (per-condition output retention is incomplete across some prior phases) as a process item, not an experimental one.

\section{Conclusion}
\label{sec:conclusion}

Starting from NVIDIA CaP-X, we tested whether an LLM robot-code agent benefits from a mined executable skill library. In the \texttt{cube\_lifting} regime, the answer is yes under no-dedup selection: the gated no-dedup mined-skill library (\cond{C2}) reaches $68/70 = 97.1\%$ versus the powered no-skill baseline (\cond{P21\_a}) at $39/50 = 78.0\%$, and the typed empty-stub control (\cond{empty\_ns}) confirms that executable function bodies are responsible for much of the effect.

The result is not a broad transfer claim. The same evidence shows that structural survivor selection can erase the gain: the production structural-dedup library (\cond{C3v2}) falls to $58/70 = 82.9\%$, while the quality-ranked same-size library (\cond{manual\_11}) recovers $66/70 = 94.3\%$. Transfer to \texttt{cube\_stack} and LIBERO remains unmeasured because both tasks floor under the current harness.

A v2 follow-up resolves two of paper~v1's largest boundaries (Sections~\ref{sec:d2-closure}, \ref{sec:dedup-v3}) and adds three smoke micro-evaluations (Section~\ref{sec:smoke-phase2}) that bound the v2 claims. \emph{D2 closure.} On Claude Sonnet~4 the library effect is $+12$pp ($98\%$ vs $86\%$, one-sided $P\approx 0.5\%$, CIs overlap by $3.2$pp at boundary); on DeepSeek v3 it is $+88$pp ($94\%$ vs $6\%$, CIs fully separated, $P\approx 10^{-54}$). Together with paper~v1's \texttt{gpt-4.1} $+19$pp result, the three-backbone library effects span $+12 / +19 / +88$pp -- magnitude tracks how weak the no-skills baseline is, not a uniform LLM-general factor. The DeepSeek $+88$pp is the largest single-task library benefit observed in this project. \emph{Dedup~v3 algorithmic robustness.} Paper~v1's \cond{manual\_11} quality-ranked recipe partially generalises: top-$k$-by-\texttt{quality\_score} for $k\in\{10,11,12,13\}$ at $n{=}70$ each reaches $91.4\%/94.3\%/87.1\%/97.1\%$. By one-sided binomial tests against \cond{C3v2} ($82.9\%$), $k{=}11$ ($P{=}10^{-3}$) and $k{=}13$ ($P{=}2{\times}10^{-4}$) are statistically beyond \cond{C3v2}; $k{=}10$ is marginal ($P{=}0.034$); $k{=}12$ is not separated ($P{=}0.22$). The smart dedup is therefore an algorithmic recipe with measured non-monotone $k$-sensitivity rather than a uniformly robust function -- production should pick $k$ near $11$ or $13$. The next paper-worthy question is whether this recipe transfers off \texttt{cube\_lifting} -- i.e.\ whether \texttt{quality\_score}-ranked library mining works on a medium-difficulty task where the no-skills baseline lies between $30\%$ and $80\%$.
\emph{Smoke micro-evaluations.} A new \texttt{cube\_stack\_3} task floors at $0/15$ for both no-skills and library arms, with the bottleneck in \texttt{sam3} mask consolidation rather than LLM reasoning or library content; the library still confers a $24$--$34\%$ code-efficiency reduction (fewer code blocks, regenerations, wall time) at the floor, decoupling library effects on code generation from library effects on task success. A multi-session round-2 mining attempt on a mid-range library returns $0$ promotable new skills, identifying a namespace saturation regime where dense libraries induce purely imperative LLM code with nothing left to extract. A verbose mechanism trace ($n{=}3$ each on Claude / DeepSeek) suggests the multi-backbone library asymmetry tracks self-correction ability rather than reasoning ability: both backbones make the same first-attempt errors, but Claude's regeneration loop converges within $1$--$2$ iterations while DeepSeek's cycles through similar partial solutions, and the library's $+88$pp DeepSeek benefit plausibly substitutes for the broken iteration mechanism.

\section*{Acknowledgements}

This project was run by an independent team. We thank the open-source maintainers of robosuite, SciPy, and the OpenRouter community for the surrounding tooling, and the methodology reviewers whose feedback pushed the ablation controls and transfer boundaries reported here.

\textbf{AI assistance disclosure.} Anthropic's Claude (Opus 4.7, 1M-context variant) was used throughout this work as a code-generation, drafting, and methodology-review assistant on the human-supervised side. Separately, a Claude model (\texttt{claude-sonnet-4}) served as one of the policy LLMs under test in the multi-backbone replication. The human author (\texttt{realkim93}) is responsible for all experimental decisions, claims, and text in this paper. No employee or representative of Anthropic was involved beyond the public Claude product; Anthropic is not an author or affiliation.


\begin{thebibliography}{99}

\bibitem{capx2026}
M.~Fu \emph{et al.}, ``CaP-X: A Framework for Benchmarking and Improving Coding Agents for Robot Manipulation,'' arXiv:2603.22435 [cs.RO], doi:10.48550/arXiv.2603.22435, 2026.

\bibitem{liang2023code}
J.~Liang, W.~Huang, F.~Xia, P.~Xu, K.~Hausman, B.~Ichter, P.~Florence, and A.~Zeng, ``Code as Policies: Language Model Programs for Embodied Control,'' \emph{IEEE ICRA}, 2023; arXiv:2209.07753.

\bibitem{voyager}
G.~Wang \emph{et al.}, ``Voyager: An Open-Ended Embodied Agent with Large Language Models,'' arXiv preprint arXiv:2305.16291, 2023.

\bibitem{sweagent}
J.~Yang \emph{et al.}, ``SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering,'' arXiv preprint arXiv:2405.15793, 2024.

\bibitem{openhands}
X.~Wang \emph{et al.}, ``OpenHands: An Open Platform for AI Software Developers as Generalist Agents,'' arXiv preprint arXiv:2407.16741, 2024.

\bibitem{robosuite2020}
Y.~Zhu, J.~Wong, A.~Mandlekar, and R.~Mart\'{i}n-Mart\'{i}n, ``robosuite: A Modular Simulation Framework and Benchmark for Robot Learning,'' arXiv preprint arXiv:2009.12293, 2020.

\bibitem{libero2023}
B.~Liu \emph{et al.}, ``LIBERO: Benchmarking Knowledge Transfer for Lifelong Robot Learning,'' \emph{Advances in Neural Information Processing Systems}, 2023; arXiv:2306.03310.

\bibitem{wilson1927}
E.~B. Wilson, ``Probable Inference, the Law of Succession, and Statistical Inference,'' \emph{Journal of the American Statistical Association}, vol.~22, no.~158, pp.~209--212, 1927.

\end{thebibliography}

\end{document}
