From 5c892a75074cd7b25632500668fd60f5330ca841 Mon Sep 17 00:00:00 2001 From: Benoit Chesneau Date: Sun, 3 May 2026 15:14:18 +0200 Subject: [PATCH] Audit docs, drop dead code, add snippet linter Fix two stale identifier references (Py_GIL_OWN, multi_executor fallback), delete priv/_erlang_impl/_ssl.py and three uncalled py_util/0,1,2,3 exports, and repair a broken SharedDict example in docs/shared-dict.md. Add tests for py:cast/4, py:async_gather/2 and py:dup_fd/1, plus test/coverage_audit.md mapping every public API to its suite. New scripts/lint_doc_snippets.escript validates py:Fn/N calls and Python syntax in fenced blocks; wired into CI and a Makefile target. --- .github/workflows/ci.yml | 3 + Makefile | 20 ++ docs/migration.md | 9 +- docs/owngil_internals.md | 61 ++++-- docs/scalability.md | 8 +- docs/security.md | 2 + docs/shared-dict.md | 17 +- priv/_erlang_impl/_ssl.py | 329 ------------------------------ scripts/lint_doc_snippets.escript | 257 +++++++++++++++++++++++ src/py_util.erl | 32 +-- test/coverage_audit.md | 88 ++++++++ test/py_SUITE.erl | 26 +++ test/py_fd_ops_SUITE.erl | 38 +++- 13 files changed, 502 insertions(+), 388 deletions(-) create mode 100644 Makefile delete mode 100644 priv/_erlang_impl/_ssl.py create mode 100755 scripts/lint_doc_snippets.escript create mode 100644 test/coverage_audit.md diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e14e356..c7771a8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -92,6 +92,9 @@ jobs: - name: Run xref run: rebar3 xref + - name: Lint docs + run: escript scripts/lint_doc_snippets.escript + # FreeBSD test using cross-platform action test-freebsd: name: FreeBSD 14 / Python ${{ matrix.python }} diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..925c1ea --- /dev/null +++ b/Makefile @@ -0,0 +1,20 @@ +.PHONY: all compile test lint-docs clean + +all: compile + +compile: + rebar3 compile + +test: + rebar3 ct --readable=compact + +# Validate fenced 
code blocks in README.md and docs/*.md. +# Erlang `py:Fn(...)` calls must reference a real export at the right +# arity; Python blocks must parse (IndentationError tolerated for +# tutorial fragments). Mark a block to skip with `` +# on the line immediately above the opening fence. +lint-docs: compile + escript scripts/lint_doc_snippets.escript + +clean: + rebar3 clean diff --git a/docs/migration.md b/docs/migration.md index 983bcf3..5f6e1e6 100644 --- a/docs/migration.md +++ b/docs/migration.md @@ -38,6 +38,7 @@ application:set_env(erlang_python, context_mode, owngil). **`py:num_executors/0`** - Removed. Contexts now use per-context worker threads. + ```erlang %% v2.x - check executor count N = py:num_executors(). @@ -254,6 +255,7 @@ N = py_context_router:num_contexts(). The function for non-blocking Python calls has been renamed to follow gen_server conventions: **Before (v1.8.x):** + ```erlang Ref = py:call_async(math, factorial, [100]), {ok, Result} = py:await(Ref). @@ -355,6 +357,7 @@ For more sophisticated web framework integration, consider the [Reactor API](rea The process-binding functions have been removed. The new architecture uses `py_context_router` for automatic scheduler-affinity routing. **Before (v1.8.x):** + ```erlang ok = py:bind(), ok = py:exec(<<"x = 42">>), @@ -760,9 +763,9 @@ ImportError: module does not support subinterpreters ``` Options: -1. Use Python < 3.12 (falls back to multi_executor mode) -2. Check if the library has a subinterpreter-compatible version -3. Isolate the library usage to a single context +1. Use Python 3.12 or 3.13: the runtime uses `worker` mode (subinterpreters require Python 3.14+). +2. Check if the library has a subinterpreter-compatible version. +3. Isolate the library usage to a single context. 
### Python 3.14: `erlang_loop_import_failed` diff --git a/docs/owngil_internals.md b/docs/owngil_internals.md index 7ee0382..03ee606 100644 --- a/docs/owngil_internals.md +++ b/docs/owngil_internals.md @@ -425,22 +425,53 @@ class EchoProtocol(reactor.Protocol): ## Performance Characteristics -| Operation | Shared-GIL | OWN_GIL | -|-----------|-----------|---------| +| Operation | Worker (shared GIL) | OWN_GIL | +|-----------|--------------------|---------| | Call overhead | ~2.5μs | ~10μs | -| Throughput (single) | 400K/s | 100K/s | -| Parallelism | None | True | -| Resource usage | Lower | Higher (1 pthread per context) | - -Use OWN_GIL when: -- CPU-bound Python work that benefits from parallelism -- Long-running computations -- Need true concurrent Python execution - -Use worker mode when: -- I/O-bound or short operations -- High call frequency -- Resource constraints +| Throughput (single context) | ~400K/s | ~100K/s | +| Parallelism (N contexts) | GIL-bound | Linear up to N cores | +| Resource usage | One pthread per context | One pthread + one subinterpreter per context | + +## Pros and Cons + +### Pros + +- **True CPU parallelism.** Each context owns its GIL, so N contexts run on N cores at once. Worker mode serialises on the main GIL unless Python is built free-threaded (3.13t+). +- **Crash isolation.** A C-level fault in one subinterpreter leaves the others alive. Worker mode shares the main interpreter, so a corrupt module state can take everything down. +- **Clean namespace per context.** Each subinterpreter has its own `sys.modules`, so module-level state cannot bleed between contexts. Useful when running adversarial or untrusted code paths side by side. +- **Predictable scheduling.** Requests are dispatched via mutex/condvar IPC, not dirty schedulers, so OWN_GIL contexts will not be starved by other dirty NIF traffic. + +### Cons + +- **Python 3.14+ only.** Earlier versions have C-extension global-state bugs (`_decimal`, `numpy`, etc.) 
that crash inside subinterpreters. See [cpython#106078](https://github.com/python/cpython/issues/106078). +- **Higher per-call latency.** ~4x the round-trip cost of worker mode (~10μs vs ~2.5μs) because every call crosses a mutex/condvar handoff to the dedicated thread. +- **Higher memory.** Each subinterpreter imports its own copy of every module. A 50 MB module set across 8 contexts is ~400 MB resident, not 50 MB. +- **C-extension compatibility is not universal.** Extensions must opt in via the multi-phase init protocol (PEP 489) and `Py_mod_multiple_interpreters`. Pure-Python and well-behaved C extensions work; older ones fail at import inside the subinterpreter. +- **No shared Python state.** Module globals, class definitions, and cached objects are per-interpreter. Use `py:state_store/2` (ETS-backed) or `erlang.send` for cross-context data. +- **Callback re-entry is restricted.** When Python in an OWN_GIL context calls `erlang.call`, the callback runs on a thread worker, not back on the OWN_GIL thread (which cannot suspend). Re-entrant Python -> Erlang -> *same* OWN_GIL context calls will not work; use a different context for the nested call, or use `erlang.async_call` from asyncio code. +- **Process-local envs do not span interpreters.** A `py_env_resource_t` is bound to the interpreter that created it. Reusing one across contexts returns `{error, env_wrong_interpreter}`. + +### When to Use Each + +Use **OWN_GIL** when: + +- The workload is CPU-bound Python (ML inference, numpy/torch compute, parsing, codecs) and you want N-way parallelism per BEAM scheduler. +- You can pin the per-context memory budget and the modules in use are subinterpreter-safe. +- You are on Python 3.14+. + +Use **worker** (default) when: + +- You are on Python 3.12 or 3.13. +- Calls are short and frequent (every microsecond of overhead matters). +- You are running modules that are not subinterpreter-safe (some scientific stacks, older C extensions). 
+- You are already running free-threaded Python (3.13t+); worker mode gets parallelism for free without the per-interpreter memory cost. + +### Common Pitfalls + +- **Importing once is not enough.** Imports happen per subinterpreter. Pre-warming a worker context will not pre-warm the OWN_GIL contexts; do it inside each `py_context`. +- **Sharing Python objects across contexts.** Passing a `PyObject*` reference (via `py_state` or otherwise) between OWN_GIL contexts is undefined behaviour. Round-trip through Erlang terms or ETS-backed state. +- **Long-running tasks block the dispatcher.** A single OWN_GIL context processes one request at a time. If you have a 30-second compute job, parallelise across contexts; do not queue everything onto context 1. +- **Callback storms.** Heavy `erlang.call` use inside an OWN_GIL context routes to thread workers, which is fine, but the round-trip cost is then worker-style on top of OWN_GIL dispatch. For tight callback loops, prefer worker mode end-to-end. ## Benchmarking diff --git a/docs/scalability.md b/docs/scalability.md index a5c7eeb..cc32347 100644 --- a/docs/scalability.md +++ b/docs/scalability.md @@ -108,6 +108,10 @@ Ctx = py:context(1), - Higher memory usage (each interpreter loads modules separately) - Some C extensions don't support subinterpreters - Requires Python 3.14+ +- Higher per-call latency (~4x worker) +- Callback re-entry to the same context is restricted (`erlang.call` from inside an OWN_GIL context routes to a thread worker, not back to that context) + +For a fuller breakdown of OWN_GIL tradeoffs, common pitfalls, and a usage decision guide, see [OWN_GIL Internals: Pros and Cons](owngil_internals.md#pros-and-cons). 
## Subinterpreter Architecture @@ -144,7 +148,7 @@ Ctx = py:context(1), │ │ └──────────┘ │ │ └──────────┘ │ │ └──────────┘ │ │ │ └──────────────┘ └──────────────┘ └──────────────┘ │ │ │ -│ Each thread owns its interpreter's GIL (Py_GIL_OWN) │ +│ Each thread owns its GIL (PyInterpreterConfig_OWN_GIL) │ │ No GIL contention between threads │ └─────────────────────────────────────────────────────────────────┘ ``` @@ -155,7 +159,7 @@ Ctx = py:context(1), **py_context_process**: Gen_server that owns a Python context reference and handles call/eval/exec operations. -**Subinterpreter Thread Pool (C)**: Manages N threads, each with its own Python subinterpreter created with `Py_NewInterpreterFromConfig()` and `Py_GIL_OWN`. +**Subinterpreter Thread Pool (C)**: Manages N threads, each with its own Python subinterpreter created with `Py_NewInterpreterFromConfig()` and `PyInterpreterConfig_OWN_GIL`. ### Request Flow diff --git a/docs/security.md b/docs/security.md index a9d52f0..5ba726a 100644 --- a/docs/security.md +++ b/docs/security.md @@ -42,6 +42,7 @@ This provides defense-in-depth - even if Python code tries to import `os` or `su When blocked operations are attempted, you'll see: + ```python >>> import subprocess >>> subprocess.run(['ls']) @@ -50,6 +51,7 @@ fork()/exec() would corrupt the Erlang runtime. Use Erlang ports (open_port/2) for subprocess management. ``` + ```python >>> import os >>> os.fork() diff --git a/docs/shared-dict.md b/docs/shared-dict.md index dddd25b..e94abe7 100644 --- a/docs/shared-dict.md +++ b/docs/shared-dict.md @@ -279,16 +279,21 @@ ok = py:shared_dict_destroy(Session). %% Create shared cache {ok, Cache} = py:shared_dict_new(), -%% Python can populate the cache +%% Inject the handle into Python globals (py:exec/1 has no locals +%% argument, so we stash it via py:eval with a side effect). 
+{ok, _} = py:eval( + <<"(globals().__setitem__('_cache_handle', handle), None)[-1]">>, + #{handle => Cache}), + +%% Python can now populate the cache ok = py:exec(<<" from erlang import SharedDict -cache = SharedDict(handle) -cache['computed'] = expensive_computation() -">>, -ok = py:eval(<<"1">>, #{<<"handle">> => Cache}), +cache = SharedDict(_cache_handle) +cache['computed'] = 42 +">>), %% Erlang can read cached values -CachedValue = py:shared_dict_get(Cache, <<"computed">>). +42 = py:shared_dict_get(Cache, <<"computed">>). ``` ## See Also diff --git a/priv/_erlang_impl/_ssl.py b/priv/_erlang_impl/_ssl.py deleted file mode 100644 index 133428f..0000000 --- a/priv/_erlang_impl/_ssl.py +++ /dev/null @@ -1,329 +0,0 @@ -# Copyright 2026 Benoit Chesneau -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -SSL/TLS transport using ssl.MemoryBIO. - -This module provides SSL/TLS support that works with the Erlang event -loop by using Python's ssl.MemoryBIO for encryption while letting -Erlang handle the socket I/O. - -Architecture: -- Raw socket data flows through Erlang (enif_select) -- Encryption/decryption happens in Python via MemoryBIO -- Application sees decrypted data -""" - -import ssl -from asyncio import transports -from typing import Any, Optional, Callable - -__all__ = ['SSLTransport', 'create_ssl_transport'] - - -class SSLTransport(transports.Transport): - """SSL transport using ssl.MemoryBIO for encryption. 
- - This transport wraps a raw transport and provides transparent - SSL/TLS encryption using Python's ssl module with MemoryBIO. - - The key insight is that MemoryBIO allows us to do SSL encryption - without requiring a real socket file descriptor, which works - perfectly with Erlang's enif_select model. - """ - - max_size = 256 * 1024 # 256 KB - - def __init__(self, loop, raw_transport, protocol, ssl_context, - server_hostname=None, server_side=False, - ssl_handshake_timeout=None, call_connection_made=True): - """Initialize the SSL transport. - - Args: - loop: The event loop. - raw_transport: The underlying raw transport. - protocol: The application protocol. - ssl_context: SSL context for encryption. - server_hostname: Hostname for SNI (client side). - server_side: True if this is a server connection. - ssl_handshake_timeout: Timeout for the SSL handshake. - call_connection_made: Whether to call connection_made. - """ - self._loop = loop - self._raw_transport = raw_transport - self._protocol = protocol - self._ssl_context = ssl_context - self._server_hostname = server_hostname - self._server_side = server_side - self._handshake_timeout = ssl_handshake_timeout - self._call_connection_made = call_connection_made - - # SSL state - self._incoming = ssl.MemoryBIO() - self._outgoing = ssl.MemoryBIO() - self._ssl_object = ssl_context.wrap_bio( - self._incoming, self._outgoing, - server_side=server_side, - server_hostname=server_hostname - ) - - # State flags - self._handshake_complete = False - self._closing = False - self._closed = False - self._write_buffer = [] - - # Extra info - self._extra = { - 'ssl_context': ssl_context, - } - - # Create a protocol that receives raw data - self._raw_protocol = _SSLRawProtocol(self) - - async def _start(self): - """Start the SSL transport and perform handshake.""" - # Replace the raw transport's protocol with ours - self._raw_transport._protocol = self._raw_protocol - - # Perform SSL handshake - await self._do_handshake() - - # 
Update extra info - self._extra['peercert'] = self._ssl_object.getpeercert() - self._extra['cipher'] = self._ssl_object.cipher() - self._extra['compression'] = self._ssl_object.compression() - self._extra['ssl_object'] = self._ssl_object - - # Notify application protocol - if self._call_connection_made: - self._loop.call_soon(self._protocol.connection_made, self) - - async def _do_handshake(self): - """Perform SSL handshake.""" - while not self._handshake_complete: - try: - self._ssl_object.do_handshake() - self._handshake_complete = True - except ssl.SSLWantReadError: - # Need to send data and receive more - self._flush_outgoing() - await self._wait_for_data() - except ssl.SSLWantWriteError: - # Need to send buffered data - self._flush_outgoing() - - def _flush_outgoing(self): - """Flush outgoing encrypted data to raw transport.""" - data = self._outgoing.read() - if data: - self._raw_transport.write(data) - - async def _wait_for_data(self): - """Wait for data from the raw transport.""" - fut = self._loop.create_future() - - def on_data(): - if not fut.done(): - fut.set_result(None) - - self._raw_protocol._read_waiter = on_data - try: - await fut - finally: - self._raw_protocol._read_waiter = None - - def _on_raw_data(self, data: bytes): - """Called when raw encrypted data is received. - - Args: - data: Encrypted data from the network. 
- """ - self._incoming.write(data) - - if not self._handshake_complete: - # Still handshaking, notify waiter - return - - # Decrypt and deliver to application - try: - while True: - chunk = self._ssl_object.read(self.max_size) - if chunk: - self._protocol.data_received(chunk) - else: - break - except ssl.SSLWantReadError: - pass - except ssl.SSLError as e: - self._fatal_error(e, 'SSL read error') - - def _on_raw_eof(self): - """Called when the raw transport receives EOF.""" - try: - self._ssl_object.unwrap() - except ssl.SSLError: - pass - - if hasattr(self._protocol, 'eof_received'): - self._protocol.eof_received() - - def write(self, data: bytes): - """Write data to the transport. - - Args: - data: Plaintext data to send. - """ - if self._closing or self._closed: - return - if not data: - return - - if not self._handshake_complete: - self._write_buffer.append(data) - return - - try: - self._ssl_object.write(data) - self._flush_outgoing() - except ssl.SSLError as e: - self._fatal_error(e, 'SSL write error') - - def writelines(self, list_of_data): - """Write a list of data items.""" - for data in list_of_data: - self.write(data) - - def write_eof(self): - """Close the write end of the transport.""" - if self._closing: - return - self._closing = True - - try: - self._ssl_object.unwrap() - self._flush_outgoing() - except ssl.SSLError: - pass - - self._raw_transport.write_eof() - - def can_write_eof(self): - return False - - def close(self): - """Close the transport.""" - if self._closed: - return - self._closing = True - - try: - self._ssl_object.unwrap() - self._flush_outgoing() - except ssl.SSLError: - pass - - self._raw_transport.close() - self._closed = True - - def is_closing(self): - return self._closing - - def abort(self): - """Close immediately without flushing.""" - self._closed = True - self._raw_transport.abort() - - def get_extra_info(self, name, default=None): - if name in self._extra: - return self._extra[name] - return 
self._raw_transport.get_extra_info(name, default) - - def get_write_buffer_size(self): - return self._raw_transport.get_write_buffer_size() - - def get_write_buffer_limits(self): - return self._raw_transport.get_write_buffer_limits() - - def set_write_buffer_limits(self, high=None, low=None): - self._raw_transport.set_write_buffer_limits(high, low) - - def pause_reading(self): - self._raw_transport.pause_reading() - - def resume_reading(self): - self._raw_transport.resume_reading() - - def is_reading(self): - return self._raw_transport.is_reading() - - def _fatal_error(self, exc, message='Fatal SSL error'): - """Handle fatal SSL errors.""" - self._loop.call_exception_handler({ - 'message': message, - 'exception': exc, - 'transport': self, - 'protocol': self._protocol, - }) - self.abort() - - -class _SSLRawProtocol: - """Protocol that receives raw encrypted data for SSLTransport.""" - - def __init__(self, ssl_transport): - self._ssl_transport = ssl_transport - self._read_waiter = None - - def connection_made(self, transport): - pass - - def data_received(self, data): - self._ssl_transport._on_raw_data(data) - if self._read_waiter is not None: - self._read_waiter() - - def eof_received(self): - self._ssl_transport._on_raw_eof() - - def connection_lost(self, exc): - self._ssl_transport._protocol.connection_lost(exc) - - -async def create_ssl_transport( - loop, raw_transport, protocol, ssl_context, - server_hostname=None, server_side=False, - ssl_handshake_timeout=None): - """Create an SSL transport wrapping a raw transport. - - Args: - loop: The event loop. - raw_transport: The underlying raw transport. - protocol: The application protocol. - ssl_context: SSL context for encryption. - server_hostname: Hostname for SNI (client side). - server_side: True if this is a server connection. - ssl_handshake_timeout: Timeout for the SSL handshake. - - Returns: - The SSL transport. 
- """ - transport = SSLTransport( - loop, raw_transport, protocol, ssl_context, - server_hostname=server_hostname, - server_side=server_side, - ssl_handshake_timeout=ssl_handshake_timeout - ) - await transport._start() - return transport diff --git a/scripts/lint_doc_snippets.escript b/scripts/lint_doc_snippets.escript new file mode 100755 index 0000000..a8f528b --- /dev/null +++ b/scripts/lint_doc_snippets.escript @@ -0,0 +1,257 @@ +#!/usr/bin/env escript +%% -*- erlang -*- +%%! -smp enable +%% +%% lint_doc_snippets — verifies fenced code blocks in README.md and +%% docs/*.md. +%% +%% * ```erlang blocks: every `py:Fn(Args)` call must reference a +%% function exported from py.erl at the correct arity. +%% * ```python blocks: must parse as Python (ast.parse). Anything +%% stricter (undefined names, wrong types) is out of scope; we are +%% after typos and removed-API references, not full type checking. +%% * Skip with `` on the line immediately above +%% the opening fence. Use sparingly and explain why in the +%% surrounding prose. +%% +%% Exit code is the number of failures (0 = clean). +%% +%% Run: rebar3 compile && escript scripts/lint_doc_snippets.escript + +main(Args) -> + Root = filename:dirname(filename:dirname(escript:script_name())), + EbinDir = filename:join([Root, "_build", "default", "lib", + "erlang_python", "ebin"]), + case filelib:is_dir(EbinDir) of + true -> + true = code:add_path(EbinDir); + false -> + io:format(standard_error, + "ebin not found at ~s; run `rebar3 compile` first~n", + [EbinDir]), + halt(2) + end, + PyExports = load_py_exports(), + Files = case Args of + [] -> default_files(Root); + _ -> Args + end, + Failures = lists:foldl( + fun(File, Acc) -> Acc + lint_file(File, PyExports) end, + 0, Files), + case Failures of + 0 -> + io:format("lint-docs: clean (~p file(s))~n", [length(Files)]), + halt(0); + N -> + io:format(standard_error, + "lint-docs: ~p failure(s)~n", [N]), + halt(N) + end. 
+ +default_files(Root) -> + Readme = filename:join(Root, "README.md"), + Docs = filelib:wildcard(filename:join([Root, "docs", "*.md"])), + [Readme | Docs]. + +load_py_exports() -> + case code:ensure_loaded(py) of + {module, py} -> + sets:from_list(py:module_info(exports)); + Other -> + io:format(standard_error, + "could not load py.beam: ~p~n", [Other]), + halt(2) + end. + +%%% ---------------------------------------------------------------- +%%% per-file walk +%%% ---------------------------------------------------------------- + +lint_file(File, PyExports) -> + case file:read_file(File) of + {ok, Bin} -> + Lines = binary:split(Bin, <<"\n">>, [global]), + Numbered = lists:zip(lists:seq(1, length(Lines)), Lines), + Blocks = collect_blocks(Numbered, <<>>, []), + lists:foldl( + fun(Block, Acc) -> Acc + lint_block(File, Block, PyExports) end, + 0, Blocks); + {error, Reason} -> + io:format(standard_error, "~s: read failed (~p)~n", + [File, Reason]), + 1 + end. + +%% Walk numbered lines and emit {Lang, StartLine, BodyBin} for each +%% ```LANG ... ``` block. PrevLine carries the previous line so we can +%% detect a `` marker on the line immediately before +%% an opening fence. +collect_blocks([], _Prev, Acc) -> + lists:reverse(Acc); +collect_blocks([{_, Line} | Rest], Prev, Acc) -> + case classify(Line) of + {open, Lang} -> + Skip = is_skip_marker(Prev), + {Body, RestAfter, StartLine} = + consume_block(Rest, [], Skip, Lang), + NewAcc = case Skip of + true -> Acc; + false -> [{Lang, StartLine, Body} | Acc] + end, + collect_blocks(RestAfter, Line, NewAcc); + _ -> + collect_blocks(Rest, Line, Acc) + end. + +is_skip_marker(Line) -> + Trim = string:trim(Line), + Trim =:= <<"">>. + +classify(Line) -> + Trim = string:trim(Line), + case Trim of + <<"```erlang">> -> {open, erlang}; + <<"```python">> -> {open, python}; + <<"```py">> -> {open, python}; + <<"```">> -> close; + _ -> other + end. + +%% Consume lines until the closing ``` fence. Returns {BodyBin, RestLines, FirstLine}. 
+consume_block([], BodyAcc, _Skip, _Lang) ->
+    {iolist_to_binary(lists:reverse(BodyAcc)), [], 0};
+consume_block([{LineNo, Line} | Rest], BodyAcc, _Skip, Lang) ->
+    case classify(Line) of
+        close ->
+            {iolist_to_binary(lists:reverse(BodyAcc)), Rest, LineNo};
+        _ ->
+            consume_block(Rest, [<<Line/binary, "\n">> | BodyAcc],
+                          _Skip, Lang)
+    end.
+
+
+%%% ----------------------------------------------------------------
+%%% per-block lint
+%%% ----------------------------------------------------------------
+
+lint_block(File, {erlang, _Line, Body}, PyExports) ->
+    lint_erlang_block(File, Body, PyExports);
+lint_block(File, {python, _Line, Body}, _) ->
+    lint_python_block(File, Body);
+lint_block(_, _, _) ->
+    0.
+
+%% Erlang: scan tokens, look for `py:Fn(...)` patterns and check exports.
+lint_erlang_block(File, Body, PyExports) ->
+    case erl_scan:string(unicode:characters_to_list(Body)) of
+        {ok, Tokens, _} ->
+            check_py_calls(File, Tokens, PyExports);
+        {error, _Info, _Loc} ->
+            %% Erlang shell-style snippets (`1> ...`) and tutorial
+            %% fragments don't tokenise. Don't fail on those — the
+            %% goal is to catch wrong API names, not enforce
+            %% standalone parseability.
+            0
+    end.
+
+%% Walk the token list looking for the pattern:
+%%   atom(py) ':' atom(Fn) '(' ... ')'
+%% The counter consumes tokens up to and including the matching `)`.
+check_py_calls(_File, [], _Exports) -> 0;
+check_py_calls(File, [{atom, _, py}, {':', _}, {atom, Loc, Fn},
+                      {'(', _} | Rest], Exports) ->
+    {Arity, After} = count_top_level_args(Rest, 1, 0, 0),
+    Bad = case sets:is_element({Fn, Arity}, Exports) of
+              true -> 0;
+              false ->
+                  io:format(standard_error,
+                            "~s:~p: undefined or wrong-arity call py:~s/~p~n",
+                            [File, line_of(Loc), atom_to_list(Fn), Arity]),
+                  1
+          end,
+    Bad + check_py_calls(File, After, Exports);
+check_py_calls(File, [_ | Rest], Exports) ->
+    check_py_calls(File, Rest, Exports).
+
+line_of({Line, _Col}) -> Line;
+line_of(Line) when is_integer(Line) -> Line;
+line_of(_) -> 0.
+ +%% Count top-level arguments in a function call. Tracks nesting of +%% `(`, `[`, `{` so commas inside nested expressions are ignored. +%% Returns {Arity, TokensAfterClosingParen} — the caller advances past +%% the call before resuming the scan, so a `py:foo(...)` nested inside +%% another `py:bar(...)` is not double-counted. +count_top_level_args([{')', _} | Rest], 1, 0, _Args) -> + %% Empty arg list `f()`. + {0, Rest}; +count_top_level_args([{')', _} | Rest], 1, _Args, ArgsSoFar) -> + %% Closing the outer `(` — return arg count. + {ArgsSoFar + 1, Rest}; +count_top_level_args([], _Depth, ArgsSeen, ArgsSoFar) -> + %% Truncated snippet; best-effort count. + {ArgsSoFar + (case ArgsSeen of 0 -> 0; _ -> 1 end), []}; +count_top_level_args([{Tok, _} | Rest], Depth, _Args, ArgsSoFar) + when Tok =:= '('; Tok =:= '['; Tok =:= '{'; Tok =:= '<<' -> + count_top_level_args(Rest, Depth + 1, 1, ArgsSoFar); +count_top_level_args([{Tok, _} | Rest], Depth, _Args, ArgsSoFar) + when Tok =:= ')'; Tok =:= ']'; Tok =:= '}'; Tok =:= '>>' -> + count_top_level_args(Rest, Depth - 1, 1, ArgsSoFar); +%% `fun`, `case`, `if`, `try`, `receive`, `begin` open an `end`-terminated +%% block. Treat them as depth brackets so commas inside fun bodies +%% (which are part of one outer arg) are not counted. +count_top_level_args([{Kw, _} | Rest], Depth, _Args, ArgsSoFar) + when Kw =:= 'fun'; Kw =:= 'case'; Kw =:= 'if'; Kw =:= 'try'; + Kw =:= 'receive'; Kw =:= 'begin' -> + count_top_level_args(Rest, Depth + 1, 1, ArgsSoFar); +count_top_level_args([{'end', _} | Rest], Depth, _Args, ArgsSoFar) -> + count_top_level_args(Rest, Depth - 1, 1, ArgsSoFar); +count_top_level_args([{',', _} | Rest], 1, _ArgsSeen, ArgsSoFar) -> + count_top_level_args(Rest, 1, 0, ArgsSoFar + 1); +count_top_level_args([_Tok | Rest], Depth, _Args, ArgsSoFar) -> + count_top_level_args(Rest, Depth, 1, ArgsSoFar). + +%% Python: write the snippet to a temp file and run ast.parse on it. 
+%% IndentationError / TabError are tolerated because tutorial-style +%% fragments often omit outer scope (e.g. show only the body of an +%% if-block, or a single line of a larger function). SyntaxError still +%% fails the lint. +lint_python_block(File, Body) -> + TmpDir = case os:getenv("TMPDIR") of + false -> "/tmp"; + Dir -> Dir + end, + Tmp = filename:join(TmpDir, "lint_snip_" + ++ integer_to_list(erlang:unique_integer([positive])) + ++ ".py"), + ok = file:write_file(Tmp, Body), + Script = + "import ast,sys\n" + "src=open(sys.argv[1]).read()\n" + "try:\n" + " ast.parse(src)\n" + "except (IndentationError, TabError):\n" + " sys.exit(0)\n" + "except SyntaxError as e:\n" + " print(f'SyntaxError: {e}')\n" + " sys.exit(1)\n", + Cmd = "python3 -c " ++ shell_quote(Script) ++ " " ++ Tmp + ++ " 2>&1; echo __exit_$?__", + Out = os:cmd(Cmd), + file:delete(Tmp), + case re:run(Out, <<"__exit_0__\\s*$">>, [{capture, none}]) of + match -> 0; + nomatch -> + Trimmed = re:replace(Out, <<"__exit_\\d+__\\s*$">>, <<>>, + [{return, list}]), + io:format(standard_error, + "~s: python snippet failed:~n~s~n", + [File, Trimmed]), + 1 + end. + +%% Single-quote a string for safe inclusion in /bin/sh. +shell_quote(S) -> + Escaped = re:replace(S, "'", "'\\\\''", [global, {return, list}]), + "'" ++ Escaped ++ "'". diff --git a/src/py_util.erl b/src/py_util.erl index 415e267..5ed4319 100644 --- a/src/py_util.erl +++ b/src/py_util.erl @@ -18,15 +18,9 @@ -module(py_util). -export([ - to_binary/1, - send_response/3, - normalize_timeout/1, - normalize_timeout/2 + to_binary/1 ]). -%% Default timeout (30 seconds) --define(DEFAULT_TIMEOUT, 30000). - %%% ============================================================================ %%% API %%% ============================================================================ @@ -39,27 +33,3 @@ to_binary(List) when is_list(List) -> list_to_binary(List); to_binary(Bin) when is_binary(Bin) -> Bin. - -%% @doc Send a response to a caller. 
--spec send_response(pid(), reference(), {ok, term()} | {error, term()} | ok) -> ok. -send_response(Caller, Ref, {ok, Value}) -> - Caller ! {py_response, Ref, {ok, Value}}, - ok; -send_response(Caller, Ref, {error, Error}) -> - Caller ! {py_error, Ref, Error}, - ok; -send_response(Caller, Ref, ok) -> - Caller ! {py_response, Ref, {ok, none}}, - ok. - -%% @doc Normalize a timeout value, returning milliseconds or 0 for infinity. -%% Uses the default timeout for invalid values. --spec normalize_timeout(timeout()) -> non_neg_integer(). -normalize_timeout(Timeout) -> - normalize_timeout(Timeout, ?DEFAULT_TIMEOUT). - -%% @doc Normalize a timeout value with a custom default. --spec normalize_timeout(timeout(), non_neg_integer()) -> non_neg_integer(). -normalize_timeout(infinity, _Default) -> 0; -normalize_timeout(Ms, _Default) when is_integer(Ms), Ms > 0 -> Ms; -normalize_timeout(_, Default) -> Default. diff --git a/test/coverage_audit.md b/test/coverage_audit.md new file mode 100644 index 0000000..2502366 --- /dev/null +++ b/test/coverage_audit.md @@ -0,0 +1,88 @@ +# Documentation Snippet → Test Coverage Audit + +This file maps every public `py:*` and `erlang.*` API shown in the +README and `docs/*.md` to at least one `*_SUITE.erl` test that +exercises it. Update this table whenever a documented API is added, +renamed, or removed. 
+ +## Erlang public API (`src/py.erl` exports) + +| API | Documented in | Test suite | Test case | +|---|---|---|---| +| `py:call/3` | README, getting-started.md, pools.md | `py_SUITE` | `test_call_math`, `test_call_json` | +| `py:call/4` | README, pools.md | `py_SUITE` | `test_call_kwargs` | +| `py:call/5` | pools.md | `py_pool_SUITE` | router-bound call tests | +| `py:cast/3` | README, threading.md | `py_SUITE` | `test_cast` | +| `py:cast/4` | README | `py_SUITE` | `test_cast_kwargs` | +| `py:cast/5` | (internal Ctx variant) | `py_SUITE` (indirect) | covered via `test_cast` | +| `py:eval/1` | README, getting-started.md | `py_SUITE` | `test_eval` | +| `py:eval/2` | README, getting-started.md | `py_SUITE` | `test_eval`, `test_eval_complex_locals` | +| `py:eval/3` | README | `py_SUITE` | `test_timeout` | +| `py:exec/1` | README, getting-started.md | `py_SUITE` | `test_exec` | +| `py:exec/2` | scalability.md (with Ctx) | `py_context_SUITE` | exec tests | +| `py:spawn_call/3` | README | `py_SUITE` | `test_spawn_call` | +| `py:await/1`, `py:await/2` | README | `py_SUITE` | `test_spawn_call` | +| `py:async_call/3` | README, asyncio.md, threading.md | `py_SUITE` | `test_asyncio_call` | +| `py:async_await/1`, `py:async_await/2` | README, asyncio.md | `py_SUITE` | `test_asyncio_call` | +| `py:async_gather/1` | README, asyncio.md | `py_SUITE` | `test_asyncio_gather` | +| `py:async_gather/2` | (timeout variant) | `py_SUITE` | `test_asyncio_gather_timeout` | +| `py:parallel/1` | README, scalability.md | `py_SUITE` | `test_parallel_execution` | +| `py:subinterp_supported/0` | scalability.md | `py_SUITE` | `test_subinterp_supported` | +| `py:execution_mode/0` | README, scalability.md | `py_SUITE` | `test_execution_mode` | +| `py:context/0`, `py:context/1` | README, process-bound-envs.md | `py_context_SUITE` | context creation | +| `py:state_set`, `state_fetch`, `state_incr`, `state_decr`, `state_remove`, `state_keys`, `state_clear` | README, getting-started.md | `py_SUITE` | 
`test_shared_state` | +| `py:register_function/2`, `register_function/3` | README, threading.md | `py_SUITE` | `test_erlang_callback`, `test_erlang_callback_mfa` | +| `py:unregister_function/1` | README | `py_SUITE` | `test_erlang_callback` | +| `py:stream/3`, `py:stream_eval/1` | README, streaming.md | `py_SUITE` | `test_streaming`, `test_stream_function` | +| `py:dup_fd/1` | reactor.md | `py_fd_ops_SUITE` | `dup_fd_test` | +| `py:configure_logging/0,1` | README, logging.md | `py_logging_SUITE` | logging tests | +| `py:enable_tracing/0`, `disable_tracing/0`, `get_traces/0`, `clear_traces/0` | README, logging.md | `py_logging_SUITE` | tracing tests | +| `py:activate_venv/1`, `deactivate_venv/0`, `venv_info/0` | README | `py_SUITE` | `test_venv` | +| `py:memory_stats/0`, `py:gc/0` | README | `py_SUITE` | `test_memory_stats`, `test_gc` | +| `py:tracemalloc_start/0`, `tracemalloc_stop/0` | README | `py_SUITE` | indirect via memory tests | +| `py:version/0` | (no doc) | not exercised | — | + +## Python public API (`erlang.*` and `from erlang import ...`) + +| API | Documented in | Test suite | +|---|---|---| +| `erlang.call(name, *args)` | README, threading.md | `py_SUITE` (`test_erlang_callback`) | +| `erlang.async_call(name, *args)` | threading.md, asyncio.md | `py_async_e2e_SUITE`, `py_thread_callback_SUITE` | +| `erlang.send(pid, msg)` | README, threading.md | `py_pid_send_SUITE` | +| `erlang.atom(name)` | (Python-side wrapper) | `py_owngil_features_SUITE`, `py_test_pid_send.py` | +| `erlang.Pid` | type-conversion.md | `py_pid_send_SUITE` | +| `erlang.Span`, `erlang.trace()` | logging.md | `py_logging_SUITE` | +| `erlang.run(main)`, `erlang.new_event_loop()` | asyncio.md | `py_asyncio_compat_SUITE`, `py_event_loop_SUITE` | +| `erlang.install()` | asyncio.md | `py_asyncio_policy_SUITE` | +| `erlang.spawn_task()` | asyncio.md | `py_async_task_SUITE` | +| `erlang.sleep()` | asyncio.md | `py_erlang_sleep_SUITE` | +| `erlang.SharedDict` | shared-dict.md | `py_SUITE` 
(`test_shared_dict_*`) | +| `erlang.Channel`, `erlang.ByteChannel` | channel.md, buffer.md | `py_channel_SUITE`, `py_byte_channel_SUITE` | +| `erlang.reactor.Protocol` and friends | reactor.md | `py_reactor_SUITE` | +| `erlang.state_set / state_get / state_delete / state_keys / state_incr / state_decr` (registered callbacks) | README, getting-started.md | `py_SUITE` (`test_shared_state`) | + +## Notes for future audits + +1. The `py:cast/5` (Ctx, Module, Func, Args, Kwargs) variant is not + directly exercised. It is a one-line wrapper around + `py_context:call/5` and shares its kwargs marshaling with + `py:call/5`. If a future change diverges them, add a dedicated test. + +2. `py:version/0` is exported but not documented. Either document it + or drop the export. + +3. `from erlang import run_command` (`docs/migration.md:426`) and + `from erlang import run_shell` (`docs/security.md:103`) refer to + user-side registrations, not built-ins. The surrounding prose makes + the example pattern explicit; do not mistake these for documented + APIs. + +4. `priv/_erlang_impl/_ssl.py` was removed in this audit (no importer). + Do not reintroduce unless an importer is wired up. + +5. `priv/_erlang_impl/_subprocess.py` is intentionally a stub that + raises `NotImplementedError` from `create_subprocess_shell` and + `create_subprocess_exec`. It is lazy-imported by + `priv/_erlang_impl/_loop.py:948,958` when an asyncio caller invokes + `loop.subprocess_shell()` / `loop.subprocess_exec()`, which are + blocked because `fork()` would corrupt the Erlang VM. Do not delete. 
diff --git a/test/py_SUITE.erl b/test/py_SUITE.erl index 5df9d98..d4b0f0a 100644 --- a/test/py_SUITE.erl +++ b/test/py_SUITE.erl @@ -17,6 +17,7 @@ test_eval_complex_locals/1, test_exec/1, test_cast/1, + test_cast_kwargs/1, test_spawn_call/1, test_type_conversions/1, test_nested_types/1, @@ -35,6 +36,7 @@ test_erlang_attr_syntax/1, test_asyncio_call/1, test_asyncio_gather/1, + test_asyncio_gather_timeout/1, test_subinterp_supported/1, test_parallel_execution/1, test_venv/1, @@ -76,6 +78,7 @@ all() -> test_eval_complex_locals, test_exec, test_cast, + test_cast_kwargs, test_spawn_call, test_type_conversions, test_nested_types, @@ -94,6 +97,7 @@ all() -> test_erlang_attr_syntax, test_asyncio_call, test_asyncio_gather, + test_asyncio_gather_timeout, test_subinterp_supported, test_parallel_execution, test_venv, @@ -230,6 +234,17 @@ test_cast(_Config) -> true = lists:sort([R1, R2]) =:= [<<"msg1">>, <<"msg2">>], ok. +test_cast_kwargs(_Config) -> + %% py:cast/4 must accept and forward a kwargs map without crashing. + %% The kwargs marshaling itself is exercised by test_call_kwargs/1; this + %% covers the cast/4 wrapper whose only difference from cast/3 is the + %% Kwargs map. + ok = py:cast(json, dumps, [#{<<"a">> => 1}], #{indent => 2}), + ok = py:cast(json, dumps, [[1, 2, 3]], #{separators => [<<", ">>, <<": ">>]}), + %% Allow the spawned executors to actually run the calls. + timer:sleep(100), + ok. + test_spawn_call(_Config) -> %% Test spawn_call with await Ref1 = py:spawn_call(math, sqrt, [100]), @@ -592,6 +607,17 @@ test_asyncio_gather(_Config) -> {ok, [4.0, 5.0, 6.0]} = py:async_gather(Calls), ok. +test_asyncio_gather_timeout(_Config) -> + %% py:async_gather/2 with an explicit timeout returns inside the budget + %% on success. + Calls = [ + {math, sqrt, [4]}, + {math, sqrt, [9]}, + {math, sqrt, [16]} + ], + {ok, [2.0, 3.0, 4.0]} = py:async_gather(Calls, 5000), + ok. 
+ test_subinterp_supported(_Config) -> %% Test that subinterp_supported returns a boolean Result = py:subinterp_supported(), diff --git a/test/py_fd_ops_SUITE.erl b/test/py_fd_ops_SUITE.erl index c4b1873..d305274 100644 --- a/test/py_fd_ops_SUITE.erl +++ b/test/py_fd_ops_SUITE.erl @@ -27,7 +27,8 @@ socketpair_test/1, fd_read_write_test/1, fd_close_test/1, - fd_select_test/1 + fd_select_test/1, + dup_fd_test/1 ]). all() -> @@ -35,7 +36,8 @@ all() -> socketpair_test, fd_read_write_test, fd_close_test, - fd_select_test + fd_select_test, + dup_fd_test ]. init_per_suite(Config) -> @@ -142,3 +144,35 @@ fd_select_test(_Config) -> ok = py_nif:fd_close(Fd1), ok = py_nif:fd_close(Fd2), ok. + +%% @doc Test py:dup_fd/1 — duplicates an fd so caller and dup can be +%% closed independently and both refer to the same kernel descriptor. +dup_fd_test(_Config) -> + {ok, {Fd1, Fd2}} = py_nif:socketpair(), + + %% Duplicate Fd2; the dup must be a different integer. + {ok, DupFd2} = py:dup_fd(Fd2), + true = is_integer(DupFd2), + true = DupFd2 =/= Fd2, + true = DupFd2 >= 0, + + %% Write through Fd1 — both Fd2 and DupFd2 see the same data + %% because they reference the same socket end. + Payload = <<"dup_fd round-trip">>, + {ok, _} = py_nif:fd_write(Fd1, Payload), + timer:sleep(10), + {ok, ReadFromOriginal} = py_nif:fd_read(Fd2, 1024), + Payload = ReadFromOriginal, + + %% Close the original fd. The duplicate must remain usable. + ok = py_nif:fd_close(Fd2), + Payload2 = <<"after-close">>, + {ok, _} = py_nif:fd_write(Fd1, Payload2), + timer:sleep(10), + {ok, ReadFromDup} = py_nif:fd_read(DupFd2, 1024), + Payload2 = ReadFromDup, + + %% Clean up + ok = py_nif:fd_close(DupFd2), + ok = py_nif:fd_close(Fd1), + ok.