From 5f197cbbafbe4b7731280521691b0c0fc19f9a3e Mon Sep 17 00:00:00 2001 From: Rita Agarwala Date: Wed, 29 Apr 2026 14:12:29 +0530 Subject: [PATCH 01/19] Improving external SDK for function --- src/datacustomcode/cli.py | 86 +++++- src/datacustomcode/deploy.py | 41 +++ .../function/feature_types/chunking.py | 23 +- src/datacustomcode/function_utils.py | 245 ++++++++++++++++++ src/datacustomcode/run.py | 62 ++++- src/datacustomcode/template.py | 30 ++- src/datacustomcode/templates/__init__.py | 0 .../function/chunking/payload/config.json | 1 + .../function/chunking/payload/entrypoint.py | 74 ++++++ 9 files changed, 532 insertions(+), 30 deletions(-) create mode 100644 src/datacustomcode/function_utils.py create mode 100644 src/datacustomcode/templates/__init__.py create mode 100644 src/datacustomcode/templates/function/chunking/payload/config.json create mode 100644 src/datacustomcode/templates/function/chunking/payload/entrypoint.py diff --git a/src/datacustomcode/cli.py b/src/datacustomcode/cli.py index c6e9c5c..33a064c 100644 --- a/src/datacustomcode/cli.py +++ b/src/datacustomcode/cli.py @@ -74,6 +74,29 @@ def _configure_client_credentials( ) +def _generate_function_test_file(entrypoint_path: str) -> Optional[str]: + """Generate test.json file for a function. 
+ + Args: + entrypoint_path: Path to the function's entrypoint.py + + Returns: + Path to generated test.json, or None if generation failed + """ + from datacustomcode.template import generate_test_json + + tests_dir = os.path.join(os.path.dirname(entrypoint_path), "tests") + os.makedirs(tests_dir, exist_ok=True) + test_json_path = os.path.join(tests_dir, "test.json") + + try: + generate_test_json(entrypoint_path, test_json_path) + return test_json_path + except Exception as e: + logger.warning(f"Could not generate test.json: {e}") + return None + + @cli.command() @click.option("--profile", default="default", help="Credential profile name") @click.option( @@ -162,7 +185,6 @@ def zip(path: str, network: str): Choose based on your workload requirements.""", ) -@click.option("--function-invoke-opt") @click.option( "--sf-cli-org", default=None, @@ -176,13 +198,14 @@ def deploy( cpu_size: str, profile: str, network: str, - function_invoke_opt: str, sf_cli_org: Optional[str], ): from datacustomcode.deploy import ( COMPUTE_TYPES, CodeExtensionMetadata, + USE_IN_FEATURE_MAPPING_FOR_CONNECT_API, deploy_full, + infer_use_in_feature, ) from datacustomcode.token_provider import ( CredentialsTokenProvider, @@ -211,15 +234,21 @@ def deploy( ) if package_type == "function": - if not function_invoke_opt: + # Infer use_in_feature from function signature + entrypoint_path = os.path.join(path, "entrypoint.py") + use_in_feature = infer_use_in_feature(entrypoint_path) + if use_in_feature: + logger.info(f"Inferred use_in_feature: {use_in_feature}") + else: click.secho( - "Error: Function invoke options are required for function package type", + "Error: Could not infer function invoke options. 
Please provide --use-in-feature", fg="red", ) raise click.Abort() - else: - function_invoke_options = function_invoke_opt.split(",") - metadata.functionInvokeOptions = function_invoke_options + + # Map user-provided feature names to API names + mapped_feature = USE_IN_FEATURE_MAPPING_FOR_CONNECT_API.get(use_in_feature, use_in_feature) + metadata.functionInvokeOptions = [mapped_feature] try: if sf_cli_org: @@ -238,19 +267,29 @@ def deploy( @click.option( "--code-type", default="script", type=click.Choice(["script", "function"]) ) -def init(directory: str, code_type: str): +@click.option( + "--use-in-feature", + default="SearchIndexChunking", + help="Feature to invoke the function (only applicable for functions). If not provided, will be inferred from function signature.", +) +def init(directory: str, code_type: str, use_in_feature: Optional[str]): from datacustomcode.scan import ( dc_config_json_from_file, update_config, write_sdk_config, ) - from datacustomcode.template import copy_function_template, copy_script_template + from datacustomcode.template import ( + copy_function_template, + copy_script_template, + ) click.echo("Copying template to " + click.style(directory, fg="blue", bold=True)) if code_type == "script": copy_script_template(directory) elif code_type == "function": - copy_function_template(directory) + # Default to SearchIndexChunking if not provided + feature = use_in_feature + copy_function_template(directory, feature) entrypoint_path = os.path.join(directory, "payload", "entrypoint.py") config_location = os.path.join(os.path.dirname(entrypoint_path), "config.json") @@ -265,6 +304,7 @@ def init(directory: str, code_type: str): updated_config_json = update_config(entrypoint_path) with open(config_location, "w") as f: json.dump(updated_config_json, f, indent=2) + click.echo( "Start developing by updating the code in " + click.style(entrypoint_path, fg="blue", bold=True) @@ -275,6 +315,23 @@ def init(directory: str, code_type: str): + " to 
automatically update config.json when you make changes to your code" ) + # Generate test.json for functions + if code_type == "function": + test_json_path = _generate_function_test_file(entrypoint_path) + if test_json_path: + click.echo( + "Generated test file at " + + click.style(test_json_path, fg="blue", bold=True) + ) + click.echo( + "Test your function locally with " + + click.style( + f"datacustomcode run {entrypoint_path} --test_with {test_json_path}", + fg="blue", + bold=True, + ) + ) + @cli.command() @click.argument("filename") @@ -312,6 +369,12 @@ def scan(filename: str, config: str, dry_run: bool, no_requirements: bool): @click.option("--config-file", default=None) @click.option("--dependencies", default=[], multiple=True) @click.option("--profile", default="default") +@click.option( + "--test_with", + default=None, + type=click.Path(exists=True), + help="Path to test JSON file for function testing", +) @click.option( "--sf-cli-org", default=None, @@ -322,10 +385,11 @@ def run( config_file: Union[str, None], dependencies: List[str], profile: str, + test_with: Optional[str], sf_cli_org: Optional[str], ): from datacustomcode.run import run_entrypoint run_entrypoint( - entrypoint, config_file, dependencies, profile, sf_cli_org=sf_cli_org + entrypoint, config_file, dependencies, profile, test_file=test_with, sf_cli_org=sf_cli_org ) diff --git a/src/datacustomcode/deploy.py b/src/datacustomcode/deploy.py index 114252a..4249a59 100644 --- a/src/datacustomcode/deploy.py +++ b/src/datacustomcode/deploy.py @@ -65,6 +65,47 @@ def _sanitize_api_name(name: str) -> str: return sanitized +# Mapping from user-facing feature names to internal API names +USE_IN_FEATURE_MAPPING_FOR_CONNECT_API = { + "SearchIndexChunking": "UnstructuredChunking", +} + +# Mapping from Pydantic request/response types to feature names +REQUEST_TYPE_TO_FEATURE = { + "SearchIndexChunkingV1Request": "SearchIndexChunking", + "SearchIndexChunkingV1Response": "SearchIndexChunking", +} + +def 
infer_use_in_feature(entrypoint_path: str) -> Union[str, None]: + """Infer the use_in_feature from function signature. + + Checks both the request parameter type and return type annotation. + Both must map to the same feature for a valid inference. + + Args: + entrypoint_path: Path to the entrypoint.py file + + Returns: + The feature name if both request and response match, None otherwise + """ + from datacustomcode.function_utils import inspect_function_types + + request_type_name, response_type_name = inspect_function_types(entrypoint_path) + + if not request_type_name or not response_type_name: + return None + + # Look up features for both types + request_feature = REQUEST_TYPE_TO_FEATURE.get(request_type_name) + response_feature = REQUEST_TYPE_TO_FEATURE.get(response_type_name) + + # Both must be present and must match + if request_feature and response_feature and request_feature == response_feature: + return request_feature + + return None + + class CodeExtensionMetadata(BaseModel): name: str version: str diff --git a/src/datacustomcode/function/feature_types/chunking.py b/src/datacustomcode/function/feature_types/chunking.py index bdf0d91..53b9860 100644 --- a/src/datacustomcode/function/feature_types/chunking.py +++ b/src/datacustomcode/function/feature_types/chunking.py @@ -28,7 +28,7 @@ from pydantic import BaseModel, Field -class DocElement(BaseModel): +class SearchIndexDocElement(BaseModel): """Document element to be chunked""" text: str = Field(..., description="Text content to be chunked") @@ -37,7 +37,7 @@ class DocElement(BaseModel): ) -class ChunkOutput(BaseModel): +class SearchIndexChunkOutput(BaseModel): """Output chunk from the chunking process""" chunk_id: str = Field(..., description="UUID for this chunk") @@ -55,20 +55,17 @@ class ChunkOutput(BaseModel): ) -class StatusResponse(BaseModel): +class SearchIndexStatusResponse(BaseModel): """Status response for operation""" status_type: str = Field(..., description="'success' or 'error'") 
status_message: str = Field(..., description="Human-readable status") -class UdsChunkingV1BatchRequest(BaseModel): +class SearchIndexChunkingV1Request(BaseModel): """Batch request for UDS chunking""" - version: Literal["v1"] = Field( - default="v1", description="API version, must be 'v1'" - ) - input: List[DocElement] = Field( + input: List[SearchIndexDocElement] = Field( ..., min_length=1, description="List of documents (min 1)" ) max_characters: int = Field(..., description="Max chars per chunk (default: 100)") @@ -77,13 +74,9 @@ class UdsChunkingV1BatchRequest(BaseModel): ) -class UdsChunkingV1BatchResponse(BaseModel): +class SearchIndexChunkingV1Response(BaseModel): """Batch response for UDS chunking""" - - version: Literal["v1"] = Field( - default="v1", description="API version, must be 'v1'" - ) - output: List[ChunkOutput] = Field( + output: List[SearchIndexChunkOutput] = Field( default_factory=list, description="Flat list of chunks from all docs" ) - status: StatusResponse = Field(..., description="Overall operation status") + status: SearchIndexStatusResponse = Field(..., description="Overall operation status") diff --git a/src/datacustomcode/function_utils.py b/src/datacustomcode/function_utils.py new file mode 100644 index 0000000..18d623c --- /dev/null +++ b/src/datacustomcode/function_utils.py @@ -0,0 +1,245 @@ +# Copyright (c) 2025, Salesforce, Inc. +# SPDX-License-Identifier: Apache-2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Utilities for inspecting and working with function entrypoints.""" + +import importlib.util +import inspect +import json +import sys +import typing +from typing import Any, Optional, Tuple + + +def load_function_module(entrypoint_path: str, module_name: str = "function_module"): + """Load a function entrypoint as a Python module. + + Args: + entrypoint_path: Path to the entrypoint.py file + module_name: Name to assign to the module + + Returns: + The loaded module + + Raises: + ImportError: If the module cannot be loaded + """ + spec = importlib.util.spec_from_file_location(module_name, entrypoint_path) + if spec is None or spec.loader is None: + raise ImportError(f"Could not load module from {entrypoint_path}") + + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +def get_function_callable(module): + """Get the 'function' callable from a module. + + Args: + module: The module to extract the function from + + Returns: + The function callable + + Raises: + AttributeError: If module doesn't have a 'function' attribute + """ + if not hasattr(module, "function"): + raise AttributeError(f"Module does not have a 'function' callable") + return module.function + + +def get_type_name(type_annotation: Any) -> Optional[str]: + """Extract the type name from a type annotation. + + Args: + type_annotation: A type annotation object + + Returns: + The type name as a string, or None if it cannot be determined + """ + if type_annotation == inspect.Parameter.empty: + return None + + if hasattr(type_annotation, "__name__"): + return type_annotation.__name__ + + return str(type_annotation) + + +def get_function_signature_types( + function_callable, +) -> Tuple[Optional[Any], Optional[Any], Optional[str], Optional[str]]: + """Extract request and response types from a function signature. 
+ + Args: + function_callable: The function to inspect + + Returns: + Tuple of (request_type, response_type, request_type_name, response_type_name) + Any of these can be None if not found + """ + sig = inspect.signature(function_callable) + params = list(sig.parameters.values()) + + request_type = None + request_type_name = None + if len(params) >= 1: + request_type = params[0].annotation + request_type_name = get_type_name(request_type) + + response_type = sig.return_annotation + response_type_name = get_type_name(response_type) + + return request_type, response_type, request_type_name, response_type_name + + +def inspect_function_types( + entrypoint_path: str, +) -> Tuple[Optional[str], Optional[str]]: + """Inspect a function entrypoint and extract type names. + + Args: + entrypoint_path: Path to the entrypoint.py file + + Returns: + Tuple of (request_type_name, response_type_name) + Either can be None if not found or on error + + Example: + >>> request_name, response_name = inspect_function_types("payload/entrypoint.py") + >>> print(request_name) # "SearchIndexChunkingV1Request" + >>> print(response_name) # "SearchIndexChunkingV1Response" + """ + try: + module = load_function_module(entrypoint_path, "temp_module") + function_callable = get_function_callable(module) + _, _, request_type_name, response_type_name = get_function_signature_types( + function_callable + ) + return request_type_name, response_type_name + except Exception: + return None, None + + +def get_request_type(entrypoint_path: str) -> Optional[Any]: + """Get the request type annotation from a function entrypoint. 
+ + Args: + entrypoint_path: Path to the entrypoint.py file + + Returns: + The request type (Pydantic model class), or None if not found + + Raises: + ImportError: If the module cannot be loaded + AttributeError: If the function is not found + ValueError: If the function signature is invalid + """ + module = load_function_module(entrypoint_path) + function_callable = get_function_callable(module) + + sig = inspect.signature(function_callable) + params = list(sig.parameters.values()) + + if len(params) < 1: + raise ValueError("Function must accept at least one parameter (request)") + + request_type = params[0].annotation + if request_type == inspect.Parameter.empty: + raise ValueError("Function request parameter must have a type annotation") + + return request_type + + +def generate_sample_value(field_type, field_name: str): + """Generate a sample value based on field type. + + Args: + field_type: The type annotation of the field + field_name: The name of the field (used for contextual sample generation) + + Returns: + A sample value appropriate for the field type + """ + origin = typing.get_origin(field_type) + + if origin is list or field_type is list: + args = typing.get_args(field_type) + if args: + return [generate_sample_value(args[0], field_name)] + return [] + elif origin is dict or field_type is dict: + return {} + elif field_type is str or origin is typing.Literal: + if "version" in field_name.lower(): + return "v1" + return f"sample_{field_name}" + elif field_type is int: + if "max" in field_name.lower() or "characters" in field_name.lower(): + return 100 + return 1 + elif field_type is float: + return 1.0 + elif field_type is bool: + return True + elif hasattr(field_type, "model_fields"): + # Nested Pydantic model + nested_data = {} + for nested_field_name, nested_field_info in field_type.model_fields.items(): + if nested_field_info.is_required(): + nested_data[nested_field_name] = generate_sample_value( + nested_field_info.annotation, nested_field_name 
+ ) + return nested_data + else: + return None + + +def generate_test_json(entrypoint_path: str, output_path: str) -> None: + """Generate a sample test.json file for a function. + + Args: + entrypoint_path: Path to the function entrypoint.py + output_path: Output path for test.json + + Raises: + ImportError: If the module cannot be loaded + AttributeError: If the function is not found + ValueError: If the request type is not a Pydantic model + """ + # Get the request type + request_type = get_request_type(entrypoint_path) + + # Check if it's a Pydantic model + if not hasattr(request_type, "model_fields"): + raise ValueError(f"Request parameter type must be a Pydantic model") + + # Generate sample data + sample_data = {} + for field_name, field_info in request_type.model_fields.items(): + if field_info.is_required(): + sample_data[field_name] = generate_sample_value( + field_info.annotation, field_name + ) + elif field_info.default is not None: + sample_data[field_name] = field_info.default + + sample_instance = request_type(**sample_data) + + # Write to file + with open(output_path, "w") as f: + json.dump(sample_instance.model_dump(), f, indent=2) \ No newline at end of file diff --git a/src/datacustomcode/run.py b/src/datacustomcode/run.py index 0e5052a..cd6cc75 100644 --- a/src/datacustomcode/run.py +++ b/src/datacustomcode/run.py @@ -70,6 +70,7 @@ def run_entrypoint( config_file: Union[str, None], dependencies: List[str], profile: str, + test_file: Optional[str] = None, sf_cli_org: Optional[str] = None, ) -> None: """Run the entrypoint for script or function with the given config and dependencies. @@ -79,6 +80,7 @@ def run_entrypoint( config_file: The config file to use. dependencies: The dependencies to import. profile: The credentials profile to use. + test_file: Optional test JSON file for function testing. sf_cli_org: Optional SF CLI org alias or username. If provided, credentials are fetched via `sf org display` instead of from credentials.ini. 
""" @@ -138,7 +140,65 @@ def run_entrypoint( raise exc except (ModuleNotFoundError, AttributeError) as inner_exc: raise inner_exc from exc - runpy.run_path(entrypoint, init_globals=globals(), run_name="__main__") + + # Handle test file for functions + if test_file and package_type == "function": + run_function_with_test(entrypoint, test_file) + else: + runpy.run_path(entrypoint, init_globals=globals(), run_name="__main__") + + +def run_function_with_test(entrypoint: str, test_file: str) -> None: + """Run a function with test data from a JSON file. + + Dependencies are already loaded by this point, so we just import + the entrypoint module and call the function directly. + + Args: + entrypoint: Path to the function entrypoint.py + test_file: Path to test JSON file containing request data + """ + from datacustomcode.function_utils import ( + get_function_callable, + get_request_type, + load_function_module, + ) + + # Import the entrypoint module in the current environment (with all dependencies loaded) + module = load_function_module(entrypoint, "entrypoint_module") + function_callable = get_function_callable(module) + request_type = get_request_type(entrypoint) + + # Load and parse the test JSON + with open(test_file, "r") as f: + test_data = json.load(f) + + # Use Pydantic to parse and validate the request + try: + request = request_type(**test_data) + except Exception as e: + raise ValueError(f"Failed to parse test data as {request_type.__name__}: {e}") from e + + # Import Runtime + from datacustomcode.function import Runtime + + # Call the function with test data + print(f"Running function with test data from {test_file}...") + result = function_callable(request, Runtime()) + + # Pretty print the result + print("\n" + "=" * 80) + print("RESULT:") + print("=" * 80) + if hasattr(result, "model_dump"): + # Pydantic v2 + print(json.dumps(result.model_dump(), indent=2)) + elif hasattr(result, "dict"): + # Pydantic v1 + print(json.dumps(result.dict(), indent=2)) + 
else: + print(result) + print("=" * 80) def add_py_folder(entrypoint: str): diff --git a/src/datacustomcode/template.py b/src/datacustomcode/template.py index 195d4a2..424cfb6 100644 --- a/src/datacustomcode/template.py +++ b/src/datacustomcode/template.py @@ -37,11 +37,21 @@ def copy_script_template(target_dir: str) -> None: shutil.copy2(source, destination) -def copy_function_template(target_dir: str) -> None: +MAPPED_FOLDER = {"SearchIndexChunking": "chunking"} + + +def copy_function_template(target_dir: str, use_in_feature: str) -> None: os.makedirs(target_dir, exist_ok=True) - for item in os.listdir(function_template_dir): - source = os.path.join(function_template_dir, item) + if use_in_feature and use_in_feature in MAPPED_FOLDER: + feature_function_template_dir = os.path.join( + function_template_dir, MAPPED_FOLDER[use_in_feature] + ) + else: + feature_function_template_dir = function_template_dir + + for item in os.listdir(feature_function_template_dir): + source = os.path.join(feature_function_template_dir, item) destination = os.path.join(target_dir, item) if os.path.isdir(source): @@ -50,3 +60,17 @@ def copy_function_template(target_dir: str) -> None: else: logger.debug(f"Copying file {source} to {destination}...") shutil.copy2(source, destination) + + +# Re-export generate_test_json from function_utils for backwards compatibility +def generate_test_json(entrypoint_path: str, output_path: str) -> None: + """Generate a sample test.json file for a function. 
+ + Args: + entrypoint_path: Path to the function entrypoint.py + output_path: Output path for test.json + """ + from datacustomcode.function_utils import generate_test_json as _generate_test_json + + _generate_test_json(entrypoint_path, output_path) + logger.debug(f"Generated test JSON at {output_path}") diff --git a/src/datacustomcode/templates/__init__.py b/src/datacustomcode/templates/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/datacustomcode/templates/function/chunking/payload/config.json b/src/datacustomcode/templates/function/chunking/payload/config.json new file mode 100644 index 0000000..9e26dfe --- /dev/null +++ b/src/datacustomcode/templates/function/chunking/payload/config.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/src/datacustomcode/templates/function/chunking/payload/entrypoint.py b/src/datacustomcode/templates/function/chunking/payload/entrypoint.py new file mode 100644 index 0000000..d6be950 --- /dev/null +++ b/src/datacustomcode/templates/function/chunking/payload/entrypoint.py @@ -0,0 +1,74 @@ +import logging + +from datacustomcode.function import Runtime + +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + + +from datacustomcode.function.feature_types.chunking import ( + SearchIndexChunkingV1Request, + SearchIndexChunkingV1Response, + SearchIndexChunkOutput, + SearchIndexStatusResponse +) + + +def function(request: SearchIndexChunkingV1Request, runtime: Runtime) -> SearchIndexChunkingV1Response: + print(f"Received {len(request.input)} documents to chunk") + print(f"Max characters per chunk: {request.max_characters}") + + chunks = [] + chunk_id = 1 + + # Process each document + for doc_idx, doc in enumerate(request.input): + # Access fields - works identically in both Pydantic and betterproto! 
+ text = doc.text + metadata = doc.metadata if hasattr(doc.metadata, '__iter__') else {} + + print(f"📄 Processing document {doc_idx + 1}: {len(text)} characters") + + # Chunk the text + max_chars = request.max_characters + chunk_start = 0 + + while chunk_start < len(text): + chunk_end = min(chunk_start + max_chars, len(text)) + chunk_text = text[chunk_start:chunk_end] + + # Try to break at word boundary if not at end + if chunk_end < len(text) and not text[chunk_end].isspace(): + # Look for last space in chunk + last_space = chunk_text.rfind(' ') + if last_space > max_chars * 0.8: # Only if space is in last 20% + chunk_end = chunk_start + last_space + chunk_text = text[chunk_start:chunk_end] + + + # Create ChunkOutput object + chunk_output = SearchIndexChunkOutput( + chunk_id=f"chunk_{chunk_id:04d}", + chunk_type="text", + text=chunk_text.strip(), + seq_no=chunk_id, + metadata={k: str(v) for k, v in (dict(metadata) if metadata else {}).items()}, + tag_metadata={}, + citations={} + ) + chunks.append(chunk_output) + + print(f" ✂️ Chunk {chunk_id}: {len(chunk_text)} chars") + chunk_id += 1 + chunk_start = chunk_end + + print(f"✅ Generated {len(chunks)} chunks total") + + # Return UdsChunkingV1BatchResponse object + return SearchIndexChunkingV1Response( + output=chunks, + status=SearchIndexStatusResponse( + status_type="success", + status_message=f"Successfully chunked {len(request.input)} documents into {len(chunks)} chunks" + ) + ) From 804d08456278593aec31dd502f22b8949cf72c74 Mon Sep 17 00:00:00 2001 From: Rita Agarwala Date: Wed, 29 Apr 2026 14:24:45 +0530 Subject: [PATCH 02/19] Improving external SDK for function --- src/datacustomcode/cli.py | 3 ++- src/datacustomcode/run.py | 4 ---- src/datacustomcode/template.py | 12 ------------ 3 files changed, 2 insertions(+), 17 deletions(-) diff --git a/src/datacustomcode/cli.py b/src/datacustomcode/cli.py index 33a064c..94bc95a 100644 --- a/src/datacustomcode/cli.py +++ b/src/datacustomcode/cli.py @@ -83,7 +83,7 @@ def 
_generate_function_test_file(entrypoint_path: str) -> Optional[str]: Returns: Path to generated test.json, or None if generation failed """ - from datacustomcode.template import generate_test_json + from datacustomcode.function_utils import generate_test_json tests_dir = os.path.join(os.path.dirname(entrypoint_path), "tests") os.makedirs(tests_dir, exist_ok=True) @@ -91,6 +91,7 @@ def _generate_function_test_file(entrypoint_path: str) -> Optional[str]: try: generate_test_json(entrypoint_path, test_json_path) + logger.debug(f"Generated test JSON at {test_json_path}") return test_json_path except Exception as e: logger.warning(f"Could not generate test.json: {e}") diff --git a/src/datacustomcode/run.py b/src/datacustomcode/run.py index cd6cc75..605bc00 100644 --- a/src/datacustomcode/run.py +++ b/src/datacustomcode/run.py @@ -191,11 +191,7 @@ def run_function_with_test(entrypoint: str, test_file: str) -> None: print("RESULT:") print("=" * 80) if hasattr(result, "model_dump"): - # Pydantic v2 print(json.dumps(result.model_dump(), indent=2)) - elif hasattr(result, "dict"): - # Pydantic v1 - print(json.dumps(result.dict(), indent=2)) else: print(result) print("=" * 80) diff --git a/src/datacustomcode/template.py b/src/datacustomcode/template.py index 424cfb6..15c1e11 100644 --- a/src/datacustomcode/template.py +++ b/src/datacustomcode/template.py @@ -62,15 +62,3 @@ def copy_function_template(target_dir: str, use_in_feature: str) -> None: shutil.copy2(source, destination) -# Re-export generate_test_json from function_utils for backwards compatibility -def generate_test_json(entrypoint_path: str, output_path: str) -> None: - """Generate a sample test.json file for a function. 
- - Args: - entrypoint_path: Path to the function entrypoint.py - output_path: Output path for test.json - """ - from datacustomcode.function_utils import generate_test_json as _generate_test_json - - _generate_test_json(entrypoint_path, output_path) - logger.debug(f"Generated test JSON at {output_path}") From 7be51c3e7d137ff75f7934e396efd5fa604f6ca7 Mon Sep 17 00:00:00 2001 From: Rita Agarwala Date: Wed, 29 Apr 2026 14:39:08 +0530 Subject: [PATCH 03/19] Improving external SDK for function --- src/datacustomcode/function_utils.py | 46 +++++++++++++++++----------- 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/src/datacustomcode/function_utils.py b/src/datacustomcode/function_utils.py index 18d623c..fe0727e 100644 --- a/src/datacustomcode/function_utils.py +++ b/src/datacustomcode/function_utils.py @@ -165,6 +165,30 @@ def get_request_type(entrypoint_path: str) -> Optional[Any]: return request_type +def _generate_model_sample_data(model_type): + """Generate sample data for all fields in a Pydantic model. + + Args: + model_type: A Pydantic model class + + Returns: + Dictionary with sample data for all fields + """ + from pydantic_core import PydanticUndefined + + sample_data = {} + for field_name, field_info in model_type.model_fields.items(): + # Check if field has a real default value + if field_info.default is not PydanticUndefined: + sample_data[field_name] = field_info.default + else: + # Required field or field without default - generate sample + sample_data[field_name] = generate_sample_value( + field_info.annotation, field_name + ) + return sample_data + + def generate_sample_value(field_type, field_name: str): """Generate a sample value based on field type. 
@@ -197,14 +221,8 @@ def generate_sample_value(field_type, field_name: str): elif field_type is bool: return True elif hasattr(field_type, "model_fields"): - # Nested Pydantic model - nested_data = {} - for nested_field_name, nested_field_info in field_type.model_fields.items(): - if nested_field_info.is_required(): - nested_data[nested_field_name] = generate_sample_value( - nested_field_info.annotation, nested_field_name - ) - return nested_data + # Nested Pydantic model - use shared helper + return _generate_model_sample_data(field_type) else: return None @@ -228,16 +246,8 @@ def generate_test_json(entrypoint_path: str, output_path: str) -> None: if not hasattr(request_type, "model_fields"): raise ValueError(f"Request parameter type must be a Pydantic model") - # Generate sample data - sample_data = {} - for field_name, field_info in request_type.model_fields.items(): - if field_info.is_required(): - sample_data[field_name] = generate_sample_value( - field_info.annotation, field_name - ) - elif field_info.default is not None: - sample_data[field_name] = field_info.default - + # Generate sample data for ALL fields (use defaults where available) + sample_data = _generate_model_sample_data(request_type) sample_instance = request_type(**sample_data) # Write to file From 89ab4bc9b1337253512c1aedc9691835296398bd Mon Sep 17 00:00:00 2001 From: Rita Agarwala Date: Wed, 29 Apr 2026 17:12:38 +0530 Subject: [PATCH 04/19] Improving external SDK for function --- src/datacustomcode/cli.py | 23 ++-- src/datacustomcode/constants.py | 45 +++++++ src/datacustomcode/deploy.py | 21 ++-- src/datacustomcode/function_utils.py | 110 +++++++++++++++++- src/datacustomcode/template.py | 38 ++++-- .../function/chunking/payload/entrypoint.py | 61 +++++----- 6 files changed, 227 insertions(+), 71 deletions(-) create mode 100644 src/datacustomcode/constants.py diff --git a/src/datacustomcode/cli.py b/src/datacustomcode/cli.py index 94bc95a..78eaa74 100644 --- a/src/datacustomcode/cli.py +++ 
b/src/datacustomcode/cli.py @@ -27,6 +27,13 @@ from datacustomcode import AuthType from datacustomcode.auth import configure_oauth_tokens +from datacustomcode.constants import ( + CONFIG_FILE, + ENTRYPOINT_FILE, + PAYLOAD_DIR, + TEST_FILE, + TESTS_DIR, +) from datacustomcode.scan import find_base_directory, get_package_type @@ -85,9 +92,9 @@ def _generate_function_test_file(entrypoint_path: str) -> Optional[str]: """ from datacustomcode.function_utils import generate_test_json - tests_dir = os.path.join(os.path.dirname(entrypoint_path), "tests") + tests_dir = os.path.join(os.path.dirname(entrypoint_path), TESTS_DIR) os.makedirs(tests_dir, exist_ok=True) - test_json_path = os.path.join(tests_dir, "test.json") + test_json_path = os.path.join(tests_dir, TEST_FILE) try: generate_test_json(entrypoint_path, test_json_path) @@ -236,7 +243,7 @@ def deploy( if package_type == "function": # Infer use_in_feature from function signature - entrypoint_path = os.path.join(path, "entrypoint.py") + entrypoint_path = os.path.join(path, ENTRYPOINT_FILE) use_in_feature = infer_use_in_feature(entrypoint_path) if use_in_feature: logger.info(f"Inferred use_in_feature: {use_in_feature}") @@ -288,11 +295,9 @@ def init(directory: str, code_type: str, use_in_feature: Optional[str]): if code_type == "script": copy_script_template(directory) elif code_type == "function": - # Default to SearchIndexChunking if not provided - feature = use_in_feature - copy_function_template(directory, feature) - entrypoint_path = os.path.join(directory, "payload", "entrypoint.py") - config_location = os.path.join(os.path.dirname(entrypoint_path), "config.json") + copy_function_template(directory, use_in_feature) + entrypoint_path = os.path.join(directory, PAYLOAD_DIR, ENTRYPOINT_FILE) + config_location = os.path.join(os.path.dirname(entrypoint_path), CONFIG_FILE) # Write package type to SDK-specific config sdk_config = {"type": code_type} @@ -344,7 +349,7 @@ def init(directory: str, code_type: str, 
use_in_feature: Optional[str]): def scan(filename: str, config: str, dry_run: bool, no_requirements: bool): from datacustomcode.scan import update_config, write_requirements_file - config_location = config or os.path.join(os.path.dirname(filename), "config.json") + config_location = config or os.path.join(os.path.dirname(filename), CONFIG_FILE) click.echo( "Dumping scan results to config file: " + click.style(config_location, fg="blue", bold=True) diff --git a/src/datacustomcode/constants.py b/src/datacustomcode/constants.py new file mode 100644 index 0000000..e0f3b2c --- /dev/null +++ b/src/datacustomcode/constants.py @@ -0,0 +1,45 @@ +# Copyright (c) 2025, Salesforce, Inc. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Constants used throughout the datacustomcode package.""" + +# File and directory names +ENTRYPOINT_FILE = "entrypoint.py" +CONFIG_FILE = "config.json" +PAYLOAD_DIR = "payload" +TESTS_DIR = "tests" +TEST_FILE = "test.json" +REQUIREMENTS_FILE = "requirements.txt" + +# Default values +DEFAULT_PROFILE = "default" +DEFAULT_NETWORK = "default" +DEFAULT_CPU_SIZE = "CPU_2XL" + +# Feature to template folder mapping +FEATURE_TEMPLATE_MAPPING = { + "SearchIndexChunking": "chunking", +} + +# Feature name to Connect API name mapping +USE_IN_FEATURE_MAPPING_FOR_CONNECT_API = { + "SearchIndexChunking": "UnstructuredChunking", +} + +# Pydantic request/response type names to feature names +REQUEST_TYPE_TO_FEATURE = { + "SearchIndexChunkingV1Request": "SearchIndexChunking", + "SearchIndexChunkingV1Response": "SearchIndexChunking", +} \ No newline at end of file diff --git a/src/datacustomcode/deploy.py b/src/datacustomcode/deploy.py index 4249a59..db26e3c 100644 --- a/src/datacustomcode/deploy.py +++ b/src/datacustomcode/deploy.py @@ -35,6 +35,10 @@ import requests from datacustomcode.cmd import cmd_output +from datacustomcode.constants import ( + REQUEST_TYPE_TO_FEATURE, + USE_IN_FEATURE_MAPPING_FOR_CONNECT_API, +) from datacustomcode.scan import find_base_directory, get_package_type DATA_CUSTOM_CODE_PATH = "services/data/v63.0/ssot/data-custom-code" @@ -65,32 +69,23 @@ def _sanitize_api_name(name: str) -> str: return sanitized -# Mapping from user-facing feature names to internal API names -USE_IN_FEATURE_MAPPING_FOR_CONNECT_API = { - "SearchIndexChunking": "UnstructuredChunking", -} - -# Mapping from Pydantic request/response types to feature names -REQUEST_TYPE_TO_FEATURE = { - "SearchIndexChunkingV1Request": "SearchIndexChunking", - "SearchIndexChunkingV1Response": "SearchIndexChunking", -} - def infer_use_in_feature(entrypoint_path: str) -> Union[str, None]: """Infer the use_in_feature from function signature. 
Checks both the request parameter type and return type annotation. Both must map to the same feature for a valid inference. + Uses static AST parsing to avoid importing dependencies. + Args: entrypoint_path: Path to the entrypoint.py file Returns: The feature name if both request and response match, None otherwise """ - from datacustomcode.function_utils import inspect_function_types + from datacustomcode.function_utils import inspect_function_types_static - request_type_name, response_type_name = inspect_function_types(entrypoint_path) + request_type_name, response_type_name = inspect_function_types_static(entrypoint_path) if not request_type_name or not response_type_name: return None diff --git a/src/datacustomcode/function_utils.py b/src/datacustomcode/function_utils.py index fe0727e..1dd57a4 100644 --- a/src/datacustomcode/function_utils.py +++ b/src/datacustomcode/function_utils.py @@ -15,6 +15,7 @@ """Utilities for inspecting and working with function entrypoints.""" +import ast import importlib.util import inspect import json @@ -107,6 +108,93 @@ def get_function_signature_types( return request_type, response_type, request_type_name, response_type_name +def inspect_function_types_static(entrypoint_path: str) -> Tuple[Optional[str], Optional[str]]: + """Inspect function types using static AST parsing (no imports). + + This parses the Python file without executing it, so it doesn't + require dependencies to be installed. 
+ + Args: + entrypoint_path: Path to the entrypoint.py file + + Returns: + Tuple of (request_type_name, response_type_name) + """ + try: + with open(entrypoint_path, 'r') as f: + tree = ast.parse(f.read(), filename=entrypoint_path) + + # Find the 'function' definition + for node in ast.walk(tree): + if isinstance(node, ast.FunctionDef) and node.name == "function": + # Get request type (first parameter annotation) + request_type_name = None + if node.args.args and len(node.args.args) > 0: + first_param = node.args.args[0] + if first_param.annotation: + request_type_name = _get_type_name_from_ast(first_param.annotation) + + # Get response type (return annotation) + response_type_name = None + if node.returns: + response_type_name = _get_type_name_from_ast(node.returns) + + return request_type_name, response_type_name + + return None, None + except Exception: + return None, None + + +def _get_type_name_from_ast(annotation) -> Optional[str]: + """Extract type name from an AST annotation node.""" + if isinstance(annotation, ast.Name): + # Simple type: MyType + return annotation.id + elif isinstance(annotation, ast.Attribute): + # Module.Type - just return the type name + return annotation.attr + elif isinstance(annotation, ast.Subscript): + # Generic type: List[MyType], Optional[MyType] + # Return the base type name + return _get_type_name_from_ast(annotation.value) + return None + + +def _import_pydantic_model(entrypoint_path: str, type_name: str) -> Optional[Any]: + """Import a Pydantic model by finding its import statement. + + Parses the entrypoint to find where the type is imported from, + then imports just that module (not the entrypoint itself). 
+ + Args: + entrypoint_path: Path to entrypoint.py + type_name: Name of the type to import (e.g., "SearchIndexChunkingV1Request") + + Returns: + The Pydantic model class, or None if not found + """ + try: + with open(entrypoint_path, 'r') as f: + tree = ast.parse(f.read(), filename=entrypoint_path) + + # Find where this type is imported from + for node in ast.walk(tree): + if isinstance(node, ast.ImportFrom): + # from module import Type1, Type2 + for alias in node.names: + if alias.name == type_name: + # Found it! Import from the module + module_name = node.module + if module_name: + module = importlib.import_module(module_name) + return getattr(module, type_name, None) + + return None + except Exception: + return None + + def inspect_function_types( entrypoint_path: str, ) -> Tuple[Optional[str], Optional[str]]: @@ -230,17 +318,29 @@ def generate_sample_value(field_type, field_name: str): def generate_test_json(entrypoint_path: str, output_path: str) -> None: """Generate a sample test.json file for a function. + First tries static AST parsing to get type names, then uses those + to import only the Pydantic model classes (not the entrypoint). 
+ Args: entrypoint_path: Path to the function entrypoint.py output_path: Output path for test.json Raises: - ImportError: If the module cannot be loaded - AttributeError: If the function is not found - ValueError: If the request type is not a Pydantic model + ImportError: If the Pydantic model cannot be loaded + ValueError: If the request type is not found or not a Pydantic model """ - # Get the request type - request_type = get_request_type(entrypoint_path) + # First, get the type name using static parsing (no imports) + request_type_name, _ = inspect_function_types_static(entrypoint_path) + + if not request_type_name: + raise ValueError("Could not determine request type from function signature") + + # Now try to import the Pydantic model class + # Look for it in the entrypoint's imports + request_type = _import_pydantic_model(entrypoint_path, request_type_name) + + if not request_type: + raise ValueError(f"Could not import Pydantic model: {request_type_name}") # Check if it's a Pydantic model if not hasattr(request_type, "model_fields"): diff --git a/src/datacustomcode/template.py b/src/datacustomcode/template.py index 15c1e11..6f52624 100644 --- a/src/datacustomcode/template.py +++ b/src/datacustomcode/template.py @@ -17,6 +17,8 @@ from loguru import logger +from datacustomcode.constants import FEATURE_TEMPLATE_MAPPING + script_template_dir = os.path.join(os.path.dirname(__file__), "templates", "script") function_template_dir = os.path.join(os.path.dirname(__file__), "templates", "function") @@ -37,23 +39,18 @@ def copy_script_template(target_dir: str) -> None: shutil.copy2(source, destination) -MAPPED_FOLDER = {"SearchIndexChunking": "chunking"} - - def copy_function_template(target_dir: str, use_in_feature: str) -> None: os.makedirs(target_dir, exist_ok=True) - if use_in_feature and use_in_feature in MAPPED_FOLDER: - feature_function_template_dir = os.path.join( - function_template_dir, MAPPED_FOLDER[use_in_feature] - ) - else: - feature_function_template_dir 
= function_template_dir - - for item in os.listdir(feature_function_template_dir): - source = os.path.join(feature_function_template_dir, item) + # First, copy common files from base function template + for item in os.listdir(function_template_dir): + source = os.path.join(function_template_dir, item) destination = os.path.join(target_dir, item) + # Skip feature-specific subdirectories + if os.path.isdir(source) and item in FEATURE_TEMPLATE_MAPPING.values(): + continue + if os.path.isdir(source): logger.debug(f"Copying directory {source} to {destination}...") shutil.copytree(source, destination, dirs_exist_ok=True) @@ -61,4 +58,21 @@ def copy_function_template(target_dir: str, use_in_feature: str) -> None: logger.debug(f"Copying file {source} to {destination}...") shutil.copy2(source, destination) + # Then, copy feature-specific files (overwriting if needed) + if use_in_feature and use_in_feature in FEATURE_TEMPLATE_MAPPING: + feature_function_template_dir = os.path.join( + function_template_dir, FEATURE_TEMPLATE_MAPPING[use_in_feature] + ) + + for item in os.listdir(feature_function_template_dir): + source = os.path.join(feature_function_template_dir, item) + destination = os.path.join(target_dir, item) + + if os.path.isdir(source): + logger.debug(f"Copying feature-specific directory {source} to {destination}...") + shutil.copytree(source, destination, dirs_exist_ok=True) + else: + logger.debug(f"Copying feature-specific file {source} to {destination}...") + shutil.copy2(source, destination) + diff --git a/src/datacustomcode/templates/function/chunking/payload/entrypoint.py b/src/datacustomcode/templates/function/chunking/payload/entrypoint.py index d6be950..4796ef2 100644 --- a/src/datacustomcode/templates/function/chunking/payload/entrypoint.py +++ b/src/datacustomcode/templates/function/chunking/payload/entrypoint.py @@ -1,74 +1,71 @@ import logging -from datacustomcode.function import Runtime - -logger = logging.getLogger(__name__) 
-logging.basicConfig(level=logging.INFO) - +from langchain_text_splitters import RecursiveCharacterTextSplitter +from datacustomcode.function import Runtime from datacustomcode.function.feature_types.chunking import ( SearchIndexChunkingV1Request, SearchIndexChunkingV1Response, SearchIndexChunkOutput, - SearchIndexStatusResponse + SearchIndexStatusResponse, ) +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + -def function(request: SearchIndexChunkingV1Request, runtime: Runtime) -> SearchIndexChunkingV1Response: +def function( + request: SearchIndexChunkingV1Request, runtime: Runtime +) -> SearchIndexChunkingV1Response: print(f"Received {len(request.input)} documents to chunk") print(f"Max characters per chunk: {request.max_characters}") + # Initialize RecursiveCharacterTextSplitter + # It tries to split on: "\n\n", "\n", " ", "" (in that order) + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=request.max_characters, + chunk_overlap=20, # Small overlap to maintain context + length_function=len, + separators=["\n\n", "\n", " ", ""], + ) + chunks = [] chunk_id = 1 # Process each document for doc_idx, doc in enumerate(request.input): - # Access fields - works identically in both Pydantic and betterproto! 
text = doc.text - metadata = doc.metadata if hasattr(doc.metadata, '__iter__') else {} + metadata = doc.metadata if hasattr(doc.metadata, "__iter__") else {} print(f"📄 Processing document {doc_idx + 1}: {len(text)} characters") - # Chunk the text - max_chars = request.max_characters - chunk_start = 0 - - while chunk_start < len(text): - chunk_end = min(chunk_start + max_chars, len(text)) - chunk_text = text[chunk_start:chunk_end] - - # Try to break at word boundary if not at end - if chunk_end < len(text) and not text[chunk_end].isspace(): - # Look for last space in chunk - last_space = chunk_text.rfind(' ') - if last_space > max_chars * 0.8: # Only if space is in last 20% - chunk_end = chunk_start + last_space - chunk_text = text[chunk_start:chunk_end] - + # Split the text using RecursiveCharacterTextSplitter + text_chunks = text_splitter.split_text(text) - # Create ChunkOutput object + # Create chunk outputs + for chunk_text in text_chunks: chunk_output = SearchIndexChunkOutput( chunk_id=f"chunk_{chunk_id:04d}", chunk_type="text", text=chunk_text.strip(), seq_no=chunk_id, - metadata={k: str(v) for k, v in (dict(metadata) if metadata else {}).items()}, + metadata={ + k: str(v) for k, v in (dict(metadata) if metadata else {}).items() + }, tag_metadata={}, - citations={} + citations={}, ) chunks.append(chunk_output) print(f" ✂️ Chunk {chunk_id}: {len(chunk_text)} chars") chunk_id += 1 - chunk_start = chunk_end print(f"✅ Generated {len(chunks)} chunks total") - # Return UdsChunkingV1BatchResponse object return SearchIndexChunkingV1Response( output=chunks, status=SearchIndexStatusResponse( status_type="success", - status_message=f"Successfully chunked {len(request.input)} documents into {len(chunks)} chunks" - ) + status_message=f"Successfully chunked {len(request.input)} documents into {len(chunks)} chunks", + ), ) From 5cf33a90d64f55302b4ade252fbf87d1074511a1 Mon Sep 17 00:00:00 2001 From: Rita Agarwala Date: Wed, 29 Apr 2026 17:48:14 +0530 Subject: [PATCH 05/19] 
Updating sf_cli_integration.yml --- .github/workflows/sf_cli_integration.yml | 6 +++++- .../templates/function/chunking/requirements.txt | 2 ++ 2 files changed, 7 insertions(+), 1 deletion(-) create mode 100644 src/datacustomcode/templates/function/chunking/requirements.txt diff --git a/.github/workflows/sf_cli_integration.yml b/.github/workflows/sf_cli_integration.yml index b9ff172..6a046f5 100644 --- a/.github/workflows/sf_cli_integration.yml +++ b/.github/workflows/sf_cli_integration.yml @@ -200,6 +200,10 @@ jobs: echo "::error::testFunction/.datacustomcode_proj/sdk_config.json not found after function init." exit 1 } + test -f testFunction/payload/tests/test.json || { + echo "::error::testFunction/payload/tests/test.json not found after function init." + exit 1 + } # ── Function: scan ──────────────────────────────────────────────────────── @@ -251,7 +255,7 @@ jobs: # ── Function: run ───────────────────────────────────────────────────────── - - name: '[function] run — sf data-code-extension function run --entrypoint testFunction/payload/entrypoint.py -o dev1' + - name: '[function] run — sf data-code-extension function run --entrypoint testFunction/payload/entrypoint.py --test_with testFunction/payload/tests/test.json -o dev1' run: | sf data-code-extension function run \ --entrypoint testFunction/payload/entrypoint.py \ diff --git a/src/datacustomcode/templates/function/chunking/requirements.txt b/src/datacustomcode/templates/function/chunking/requirements.txt new file mode 100644 index 0000000..f872675 --- /dev/null +++ b/src/datacustomcode/templates/function/chunking/requirements.txt @@ -0,0 +1,2 @@ +# Packages required for the chunking function +langchain-text-splitters>=0.3.0 \ No newline at end of file From 874f821332d981d2ad127f32a0cd47e824be0eef Mon Sep 17 00:00:00 2001 From: Rita Agarwala Date: Wed, 29 Apr 2026 17:50:14 +0530 Subject: [PATCH 06/19] Updating sf_cli_integration.yml --- .github/workflows/sf_cli_integration.yml | 1 + 1 file changed, 1 
insertion(+) diff --git a/.github/workflows/sf_cli_integration.yml b/.github/workflows/sf_cli_integration.yml index 6a046f5..0d9c17a 100644 --- a/.github/workflows/sf_cli_integration.yml +++ b/.github/workflows/sf_cli_integration.yml @@ -259,6 +259,7 @@ jobs: run: | sf data-code-extension function run \ --entrypoint testFunction/payload/entrypoint.py \ + --test_with testFunction/payload/tests/test.json \ -o dev1 || { echo "::error::sf data-code-extension function run FAILED. Check mock server output above; the --entrypoint flag or SF CLI org auth contract may have changed." exit 1 From f9b0deb9f4f76d9b9c600bbd33cae91293bc139d Mon Sep 17 00:00:00 2001 From: Rita Agarwala Date: Wed, 29 Apr 2026 17:53:30 +0530 Subject: [PATCH 07/19] Make lint --- src/datacustomcode/cli.py | 18 +++++++------ src/datacustomcode/constants.py | 2 +- src/datacustomcode/deploy.py | 9 +++---- .../function/feature_types/chunking.py | 6 +++-- src/datacustomcode/function_utils.py | 25 ++++++++++++------- src/datacustomcode/run.py | 4 ++- src/datacustomcode/template.py | 10 +++++--- .../function/chunking/requirements.txt | 2 +- 8 files changed, 46 insertions(+), 30 deletions(-) diff --git a/src/datacustomcode/cli.py b/src/datacustomcode/cli.py index 78eaa74..381deff 100644 --- a/src/datacustomcode/cli.py +++ b/src/datacustomcode/cli.py @@ -210,8 +210,8 @@ def deploy( ): from datacustomcode.deploy import ( COMPUTE_TYPES, - CodeExtensionMetadata, USE_IN_FEATURE_MAPPING_FOR_CONNECT_API, + CodeExtensionMetadata, deploy_full, infer_use_in_feature, ) @@ -255,7 +255,9 @@ def deploy( raise click.Abort() # Map user-provided feature names to API names - mapped_feature = USE_IN_FEATURE_MAPPING_FOR_CONNECT_API.get(use_in_feature, use_in_feature) + mapped_feature = USE_IN_FEATURE_MAPPING_FOR_CONNECT_API.get( + use_in_feature, use_in_feature + ) metadata.functionInvokeOptions = [mapped_feature] try: @@ -286,10 +288,7 @@ def init(directory: str, code_type: str, use_in_feature: Optional[str]): update_config, 
write_sdk_config, ) - from datacustomcode.template import ( - copy_function_template, - copy_script_template, - ) + from datacustomcode.template import copy_function_template, copy_script_template click.echo("Copying template to " + click.style(directory, fg="blue", bold=True)) if code_type == "script": @@ -397,5 +396,10 @@ def run( from datacustomcode.run import run_entrypoint run_entrypoint( - entrypoint, config_file, dependencies, profile, test_file=test_with, sf_cli_org=sf_cli_org + entrypoint, + config_file, + dependencies, + profile, + test_file=test_with, + sf_cli_org=sf_cli_org, ) diff --git a/src/datacustomcode/constants.py b/src/datacustomcode/constants.py index e0f3b2c..76b6a7c 100644 --- a/src/datacustomcode/constants.py +++ b/src/datacustomcode/constants.py @@ -42,4 +42,4 @@ REQUEST_TYPE_TO_FEATURE = { "SearchIndexChunkingV1Request": "SearchIndexChunking", "SearchIndexChunkingV1Response": "SearchIndexChunking", -} \ No newline at end of file +} diff --git a/src/datacustomcode/deploy.py b/src/datacustomcode/deploy.py index db26e3c..65495e6 100644 --- a/src/datacustomcode/deploy.py +++ b/src/datacustomcode/deploy.py @@ -35,10 +35,7 @@ import requests from datacustomcode.cmd import cmd_output -from datacustomcode.constants import ( - REQUEST_TYPE_TO_FEATURE, - USE_IN_FEATURE_MAPPING_FOR_CONNECT_API, -) +from datacustomcode.constants import REQUEST_TYPE_TO_FEATURE from datacustomcode.scan import find_base_directory, get_package_type DATA_CUSTOM_CODE_PATH = "services/data/v63.0/ssot/data-custom-code" @@ -85,7 +82,9 @@ def infer_use_in_feature(entrypoint_path: str) -> Union[str, None]: """ from datacustomcode.function_utils import inspect_function_types_static - request_type_name, response_type_name = inspect_function_types_static(entrypoint_path) + request_type_name, response_type_name = inspect_function_types_static( + entrypoint_path + ) if not request_type_name or not response_type_name: return None diff --git 
a/src/datacustomcode/function/feature_types/chunking.py b/src/datacustomcode/function/feature_types/chunking.py index 53b9860..1a2f1d7 100644 --- a/src/datacustomcode/function/feature_types/chunking.py +++ b/src/datacustomcode/function/feature_types/chunking.py @@ -22,7 +22,6 @@ Any, Dict, List, - Literal, ) from pydantic import BaseModel, Field @@ -76,7 +75,10 @@ class SearchIndexChunkingV1Request(BaseModel): class SearchIndexChunkingV1Response(BaseModel): """Batch response for UDS chunking""" + output: List[SearchIndexChunkOutput] = Field( default_factory=list, description="Flat list of chunks from all docs" ) - status: SearchIndexStatusResponse = Field(..., description="Overall operation status") + status: SearchIndexStatusResponse = Field( + ..., description="Overall operation status" + ) diff --git a/src/datacustomcode/function_utils.py b/src/datacustomcode/function_utils.py index 1dd57a4..f803b7a 100644 --- a/src/datacustomcode/function_utils.py +++ b/src/datacustomcode/function_utils.py @@ -19,9 +19,12 @@ import importlib.util import inspect import json -import sys import typing -from typing import Any, Optional, Tuple +from typing import ( + Any, + Optional, + Tuple, +) def load_function_module(entrypoint_path: str, module_name: str = "function_module"): @@ -59,7 +62,7 @@ def get_function_callable(module): AttributeError: If module doesn't have a 'function' attribute """ if not hasattr(module, "function"): - raise AttributeError(f"Module does not have a 'function' callable") + raise AttributeError("Module does not have a 'function' callable") return module.function @@ -108,7 +111,9 @@ def get_function_signature_types( return request_type, response_type, request_type_name, response_type_name -def inspect_function_types_static(entrypoint_path: str) -> Tuple[Optional[str], Optional[str]]: +def inspect_function_types_static( + entrypoint_path: str, +) -> Tuple[Optional[str], Optional[str]]: """Inspect function types using static AST parsing (no imports). 
This parses the Python file without executing it, so it doesn't @@ -121,7 +126,7 @@ def inspect_function_types_static(entrypoint_path: str) -> Tuple[Optional[str], Tuple of (request_type_name, response_type_name) """ try: - with open(entrypoint_path, 'r') as f: + with open(entrypoint_path, "r") as f: tree = ast.parse(f.read(), filename=entrypoint_path) # Find the 'function' definition @@ -132,7 +137,9 @@ def inspect_function_types_static(entrypoint_path: str) -> Tuple[Optional[str], if node.args.args and len(node.args.args) > 0: first_param = node.args.args[0] if first_param.annotation: - request_type_name = _get_type_name_from_ast(first_param.annotation) + request_type_name = _get_type_name_from_ast( + first_param.annotation + ) # Get response type (return annotation) response_type_name = None @@ -175,7 +182,7 @@ def _import_pydantic_model(entrypoint_path: str, type_name: str) -> Optional[Any The Pydantic model class, or None if not found """ try: - with open(entrypoint_path, 'r') as f: + with open(entrypoint_path, "r") as f: tree = ast.parse(f.read(), filename=entrypoint_path) # Find where this type is imported from @@ -344,7 +351,7 @@ def generate_test_json(entrypoint_path: str, output_path: str) -> None: # Check if it's a Pydantic model if not hasattr(request_type, "model_fields"): - raise ValueError(f"Request parameter type must be a Pydantic model") + raise ValueError("Request parameter type must be a Pydantic model") # Generate sample data for ALL fields (use defaults where available) sample_data = _generate_model_sample_data(request_type) @@ -352,4 +359,4 @@ def generate_test_json(entrypoint_path: str, output_path: str) -> None: # Write to file with open(output_path, "w") as f: - json.dump(sample_instance.model_dump(), f, indent=2) \ No newline at end of file + json.dump(sample_instance.model_dump(), f, indent=2) diff --git a/src/datacustomcode/run.py b/src/datacustomcode/run.py index 605bc00..004f724 100644 --- a/src/datacustomcode/run.py +++ 
b/src/datacustomcode/run.py @@ -177,7 +177,9 @@ def run_function_with_test(entrypoint: str, test_file: str) -> None: try: request = request_type(**test_data) except Exception as e: - raise ValueError(f"Failed to parse test data as {request_type.__name__}: {e}") from e + raise ValueError( + f"Failed to parse test data as {request_type.__name__}: {e}" + ) from e # Import Runtime from datacustomcode.function import Runtime diff --git a/src/datacustomcode/template.py b/src/datacustomcode/template.py index 6f52624..8fb67ad 100644 --- a/src/datacustomcode/template.py +++ b/src/datacustomcode/template.py @@ -69,10 +69,12 @@ def copy_function_template(target_dir: str, use_in_feature: str) -> None: destination = os.path.join(target_dir, item) if os.path.isdir(source): - logger.debug(f"Copying feature-specific directory {source} to {destination}...") + logger.debug( + f"Copying feature-specific directory {source} to {destination}..." + ) shutil.copytree(source, destination, dirs_exist_ok=True) else: - logger.debug(f"Copying feature-specific file {source} to {destination}...") + logger.debug( + f"Copying feature-specific file {source} to {destination}..." 
+ ) shutil.copy2(source, destination) - - diff --git a/src/datacustomcode/templates/function/chunking/requirements.txt b/src/datacustomcode/templates/function/chunking/requirements.txt index f872675..7f5990c 100644 --- a/src/datacustomcode/templates/function/chunking/requirements.txt +++ b/src/datacustomcode/templates/function/chunking/requirements.txt @@ -1,2 +1,2 @@ # Packages required for the chunking function -langchain-text-splitters>=0.3.0 \ No newline at end of file +langchain-text-splitters>=0.3.0 From 3702b2b0fe74680f2ac56724b786af78bb7ba346 Mon Sep 17 00:00:00 2001 From: Rita Agarwala Date: Wed, 29 Apr 2026 17:57:27 +0530 Subject: [PATCH 08/19] Make lint --- src/datacustomcode/cli.py | 2 +- src/datacustomcode/function_utils.py | 6 +++--- src/datacustomcode/template.py | 3 ++- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/datacustomcode/cli.py b/src/datacustomcode/cli.py index 381deff..c061371 100644 --- a/src/datacustomcode/cli.py +++ b/src/datacustomcode/cli.py @@ -208,9 +208,9 @@ def deploy( network: str, sf_cli_org: Optional[str], ): + from datacustomcode.constants import USE_IN_FEATURE_MAPPING_FOR_CONNECT_API from datacustomcode.deploy import ( COMPUTE_TYPES, - USE_IN_FEATURE_MAPPING_FOR_CONNECT_API, CodeExtensionMetadata, deploy_full, infer_use_in_feature, diff --git a/src/datacustomcode/function_utils.py b/src/datacustomcode/function_utils.py index f803b7a..e1c91aa 100644 --- a/src/datacustomcode/function_utils.py +++ b/src/datacustomcode/function_utils.py @@ -79,7 +79,7 @@ def get_type_name(type_annotation: Any) -> Optional[str]: return None if hasattr(type_annotation, "__name__"): - return type_annotation.__name__ + return str(type_annotation.__name__) return str(type_annotation) @@ -230,14 +230,14 @@ def inspect_function_types( return None, None -def get_request_type(entrypoint_path: str) -> Optional[Any]: +def get_request_type(entrypoint_path: str) -> Any: """Get the request type annotation from a function entrypoint. 
Args: entrypoint_path: Path to the entrypoint.py file Returns: - The request type (Pydantic model class), or None if not found + The request type (Pydantic model class) Raises: ImportError: If the module cannot be loaded diff --git a/src/datacustomcode/template.py b/src/datacustomcode/template.py index 8fb67ad..6807510 100644 --- a/src/datacustomcode/template.py +++ b/src/datacustomcode/template.py @@ -14,6 +14,7 @@ # limitations under the License. import os import shutil +from typing import Optional from loguru import logger @@ -39,7 +40,7 @@ def copy_script_template(target_dir: str) -> None: shutil.copy2(source, destination) -def copy_function_template(target_dir: str, use_in_feature: str) -> None: +def copy_function_template(target_dir: str, use_in_feature: Optional[str]) -> None: os.makedirs(target_dir, exist_ok=True) # First, copy common files from base function template From 6a2b7bdeda74e7924c9aca85813750f407ac5355 Mon Sep 17 00:00:00 2001 From: Rita Agarwala Date: Wed, 29 Apr 2026 22:34:34 +0530 Subject: [PATCH 09/19] changing the argument name --- src/datacustomcode/cli.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/datacustomcode/cli.py b/src/datacustomcode/cli.py index c061371..a3aeda2 100644 --- a/src/datacustomcode/cli.py +++ b/src/datacustomcode/cli.py @@ -280,7 +280,7 @@ def deploy( @click.option( "--use-in-feature", default="SearchIndexChunking", - help="Feature to invoke the function (only applicable for functions). 
If not provided, will be inferred from function signature.", + help="Feature where this function will be used (only applicable for function).", ) def init(directory: str, code_type: str, use_in_feature: Optional[str]): from datacustomcode.scan import ( @@ -331,7 +331,7 @@ def init(directory: str, code_type: str, use_in_feature: Optional[str]): click.echo( "Test your function locally with " + click.style( - f"datacustomcode run {entrypoint_path} --test_with {test_json_path}", + f"datacustomcode run {entrypoint_path} --test-with {test_json_path}", fg="blue", bold=True, ) @@ -375,7 +375,7 @@ def scan(filename: str, config: str, dry_run: bool, no_requirements: bool): @click.option("--dependencies", default=[], multiple=True) @click.option("--profile", default="default") @click.option( - "--test_with", + "--test-with", default=None, type=click.Path(exists=True), help="Path to test JSON file for function testing", From b0608ea8c6082a6af8b789a9480d093831e21be2 Mon Sep 17 00:00:00 2001 From: Rita Agarwala Date: Thu, 30 Apr 2026 13:26:50 +0530 Subject: [PATCH 10/19] Removing function_invoke_option testcases --- tests/test_cli.py | 11 +++++------ tests/test_sf_cli_contract.py | 20 -------------------- 2 files changed, 5 insertions(+), 26 deletions(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index e26cbdc..7765560 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -103,16 +103,19 @@ def test_deploy_command_success(self, mock_token_provider, mock_deploy_full): assert call_args[0][2].access_token == "test_token" assert call_args[0][2].instance_url == "https://instance.example.com" + @patch("datacustomcode.deploy.infer_use_in_feature") @patch("datacustomcode.deploy.deploy_full") @patch("datacustomcode.token_provider.CredentialsTokenProvider") def test_deploy_command_function_invoke_options( - self, mock_token_provider, mock_deploy_full + self, mock_token_provider, mock_deploy_full, mock_infer_feature ): """Test deploy command with function invoke options.""" 
mock_provider_instance = mock_token_provider.return_value mock_provider_instance.get_token.return_value = AccessTokenResponse( access_token="test_token", instance_url="https://instance.example.com" ) + # Mock infer_use_in_feature to return a valid feature + mock_infer_feature.return_value = "SearchIndexChunking" runner = CliRunner() with runner.isolated_filesystem(): @@ -122,16 +125,12 @@ def test_deploy_command_function_invoke_options( write_sdk_config(".", sdk_config) result = runner.invoke( deploy, - ["--name", "test-job", "--function-invoke-opt", "option1,option2"], + ["--name", "test-job"], ) assert result.exit_code == 0 mock_deploy_full.assert_called_once() - # Check that deploy_full was called with function invoke options - call_args = mock_deploy_full.call_args - assert call_args[0][1].functionInvokeOptions == ["option1", "option2"] - @patch("datacustomcode.token_provider.CredentialsTokenProvider") def test_deploy_command_credentials_error(self, mock_token_provider): """Test deploy command when credentials are not available.""" diff --git a/tests/test_sf_cli_contract.py b/tests/test_sf_cli_contract.py index b96ab35..412b4c2 100644 --- a/tests/test_sf_cli_contract.py +++ b/tests/test_sf_cli_contract.py @@ -188,26 +188,6 @@ def test_accepts_network_flag( result = runner.invoke(deploy, [*self._BASE_ARGS, "--network", "custom"]) assert result.exit_code != 2, result.output - @patch("datacustomcode.token_provider.SFCLITokenProvider") - @patch("datacustomcode.deploy.deploy_full") - @patch("datacustomcode.cli.find_base_directory") - @patch("datacustomcode.cli.get_package_type") - def test_accepts_function_invoke_opt_flag( - self, mock_pkg_type, mock_find_base, mock_deploy_full, mock_sf_cli_provider - ): - mock_find_base.return_value = "payload" - mock_pkg_type.return_value = "function" - mock_provider_instance = mock_sf_cli_provider.return_value - mock_provider_instance.get_token.return_value = AccessTokenResponse( - access_token="tok", 
instance_url="https://example.com" - ) - runner = CliRunner() - result = runner.invoke( - deploy, [*self._BASE_ARGS, "--function-invoke-opt", "ASYNC"] - ) - assert result.exit_code != 2, result.output - - class TestRunArgContract: """ SF CLI spawn: From 9b3cd220c4c022c2b7bc3326caad9434b676bc8c Mon Sep 17 00:00:00 2001 From: Rita Agarwala Date: Thu, 30 Apr 2026 17:27:16 +0530 Subject: [PATCH 11/19] Adding testcase for function_utils.py --- tests/test_function_utils.py | 247 +++++++++++++++++++++++++++++++++++ 1 file changed, 247 insertions(+) create mode 100644 tests/test_function_utils.py diff --git a/tests/test_function_utils.py b/tests/test_function_utils.py new file mode 100644 index 0000000..f5d7bba --- /dev/null +++ b/tests/test_function_utils.py @@ -0,0 +1,247 @@ +# Copyright (c) 2025, Salesforce, Inc. +# SPDX-License-Identifier: Apache-2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import json +import os +import shutil +import sys +import tempfile +import textwrap +from typing import List + +import pytest +from pydantic import BaseModel + +from datacustomcode import function_utils + + +class SampleRequest(BaseModel): + message: str + count: int = 5 + tags: List[str] = [] + version: str = "v1" + + +@pytest.fixture +def sample_entrypoint(): + """Create a temporary entrypoint file with a function.""" + with tempfile.NamedTemporaryFile( + mode="w", suffix=".py", delete=False + ) as temp_file: + entrypoint_content = textwrap.dedent( + """ + from typing import List + from pydantic import BaseModel + + class SampleRequest(BaseModel): + message: str + count: int = 5 + tags: List[str] = [] + version: str = "v1" + + class SampleResponse(BaseModel): + result: str + success: bool = True + + def function(request: SampleRequest) -> SampleResponse: + return SampleResponse(result=f"Processed {request.message}") + """ + ) + temp_file.write(entrypoint_content) + temp_file_path = temp_file.name + + yield temp_file_path + + if os.path.exists(temp_file_path): + os.unlink(temp_file_path) + + +@pytest.fixture +def entrypoint_no_annotations(): + """Create an entrypoint with no type annotations.""" + with tempfile.NamedTemporaryFile( + mode="w", suffix=".py", delete=False + ) as temp_file: + entrypoint_content = textwrap.dedent( + """ + def function(request): + return {"result": "no annotations"} + """ + ) + temp_file.write(entrypoint_content) + temp_file_path = temp_file.name + + yield temp_file_path + + if os.path.exists(temp_file_path): + os.unlink(temp_file_path) + + +def test_get_function_signature_types(sample_entrypoint, entrypoint_no_annotations): + """Test extracting request and response types from function signatures.""" + module = function_utils.load_function_module(sample_entrypoint) + func = function_utils.get_function_callable(module) + req_type, resp_type, req_name, resp_name = ( + function_utils.get_function_signature_types(func) + ) + + assert 
req_name == "SampleRequest" + assert resp_name == "SampleResponse" + assert req_type is not None + assert resp_type is not None + + module_no_annot = function_utils.load_function_module(entrypoint_no_annotations) + func_no_annot = function_utils.get_function_callable(module_no_annot) + req_type, resp_type, req_name, resp_name = ( + function_utils.get_function_signature_types(func_no_annot) + ) + + assert req_name is None + assert resp_name is None + + +def test_inspect_function_types_static(sample_entrypoint, entrypoint_no_annotations): + """Test static AST-based inspection of function types.""" + req_name, resp_name = function_utils.inspect_function_types_static( + sample_entrypoint + ) + assert req_name == "SampleRequest" + assert resp_name == "SampleResponse" + + req_name, resp_name = function_utils.inspect_function_types_static( + entrypoint_no_annotations + ) + assert req_name is None + assert resp_name is None + +def test_inspect_function_types(sample_entrypoint): + """Test dynamic inspection of function types.""" + req_name, resp_name = function_utils.inspect_function_types(sample_entrypoint) + assert req_name == "SampleRequest" + assert resp_name == "SampleResponse" + + req_name, resp_name = function_utils.inspect_function_types("/nonexistent/file.py") + assert req_name is None + assert resp_name is None + + +def test_get_request_type(sample_entrypoint, entrypoint_no_annotations): + """Test getting request type from entrypoint.""" + req_type = function_utils.get_request_type(sample_entrypoint) + assert req_type is not None + assert hasattr(req_type, "model_fields") + + with pytest.raises(ValueError, match="must have a type annotation"): + function_utils.get_request_type(entrypoint_no_annotations) + + +def test_generate_test_json(): + """Test generating test.json file from entrypoint with simple and complex nested types.""" + temp_dir = tempfile.mkdtemp() + models_file = os.path.join(temp_dir, "test_models.py") + + try: + # Test 1: Simple request type + 
entrypoint_simple = os.path.join(temp_dir, "entrypoint_simple.py") + output_simple = os.path.join(temp_dir, "test_simple.json") + + with open(models_file, "w") as f: + models_content = textwrap.dedent( + """ + from pydantic import BaseModel + from typing import List + + class SimpleRequest(BaseModel): + message: str + count: int = 5 + tags: List[str] = [] + version: str = "v1" + + class NestedConfig(BaseModel): + host: str + port: int = 8080 + enabled: bool = True + + class ComplexRequest(BaseModel): + name: str + max_items: int = 100 + config: NestedConfig + metadata: dict = {} + """ + ) + f.write(models_content) + + with open(entrypoint_simple, "w") as f: + entrypoint_content = textwrap.dedent( + """ + from test_models import SimpleRequest + + def function(request: SimpleRequest): + return {"result": "ok"} + """ + ) + f.write(entrypoint_content) + + sys.path.insert(0, temp_dir) + + function_utils.generate_test_json(entrypoint_simple, output_simple) + assert os.path.exists(output_simple) + + with open(output_simple, "r") as f: + data = json.load(f) + + assert "message" in data + assert data["count"] == 5 + assert data["version"] == "v1" + assert data["tags"] == [] + + # Test 2: Complex request type with nested models + entrypoint_complex = os.path.join(temp_dir, "entrypoint_complex.py") + output_complex = os.path.join(temp_dir, "test_complex.json") + + with open(entrypoint_complex, "w") as f: + entrypoint_content = textwrap.dedent( + """ + from test_models import ComplexRequest + + def function(request: ComplexRequest): + return {"result": "ok"} + """ + ) + f.write(entrypoint_content) + + function_utils.generate_test_json(entrypoint_complex, output_complex) + assert os.path.exists(output_complex) + + with open(output_complex, "r") as f: + complex_data = json.load(f) + + assert "name" in complex_data + assert "max_items" in complex_data + assert complex_data["max_items"] == 100 + assert "config" in complex_data + assert isinstance(complex_data["config"], dict) + 
assert "host" in complex_data["config"] + assert "port" in complex_data["config"] + assert complex_data["config"]["port"] == 8080 + assert complex_data["config"]["enabled"] is True + assert "metadata" in complex_data + assert complex_data["metadata"] == {} + + finally: + if temp_dir in sys.path: + sys.path.remove(temp_dir) + if os.path.exists(temp_dir): + shutil.rmtree(temp_dir) \ No newline at end of file From 45547c181b1ed52d79290ab67480516394b03edf Mon Sep 17 00:00:00 2001 From: Rita Agarwala Date: Thu, 30 Apr 2026 17:50:23 +0530 Subject: [PATCH 12/19] Adding unit test --- tests/test_function_utils.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/tests/test_function_utils.py b/tests/test_function_utils.py index f5d7bba..081c743 100644 --- a/tests/test_function_utils.py +++ b/tests/test_function_utils.py @@ -19,21 +19,12 @@ import sys import tempfile import textwrap -from typing import List import pytest -from pydantic import BaseModel from datacustomcode import function_utils -class SampleRequest(BaseModel): - message: str - count: int = 5 - tags: List[str] = [] - version: str = "v1" - - @pytest.fixture def sample_entrypoint(): """Create a temporary entrypoint file with a function.""" From 2c17783570f233e2e67ee4bc4e412aedf0ec3366 Mon Sep 17 00:00:00 2001 From: Rita Agarwala Date: Thu, 30 Apr 2026 18:40:18 +0530 Subject: [PATCH 13/19] Correcting the testcase --- .github/workflows/sf_cli_integration.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/sf_cli_integration.yml b/.github/workflows/sf_cli_integration.yml index 0d9c17a..bc8a86b 100644 --- a/.github/workflows/sf_cli_integration.yml +++ b/.github/workflows/sf_cli_integration.yml @@ -255,11 +255,11 @@ jobs: # ── Function: run ───────────────────────────────────────────────────────── - - name: '[function] run — sf data-code-extension function run --entrypoint testFunction/payload/entrypoint.py --test_with testFunction/payload/tests/test.json -o dev1' + - 
name: '[function] run — sf data-code-extension function run --entrypoint testFunction/payload/entrypoint.py --test-with testFunction/payload/tests/test.json -o dev1' run: | sf data-code-extension function run \ --entrypoint testFunction/payload/entrypoint.py \ - --test_with testFunction/payload/tests/test.json \ + --test-with testFunction/payload/tests/test.json \ -o dev1 || { echo "::error::sf data-code-extension function run FAILED. Check mock server output above; the --entrypoint flag or SF CLI org auth contract may have changed." exit 1 From 8a9e8f082cd470b3ba0cb3adde7ec34b8d75a5f0 Mon Sep 17 00:00:00 2001 From: Rita Agarwala Date: Thu, 30 Apr 2026 18:58:29 +0530 Subject: [PATCH 14/19] Fixing lint error --- src/datacustomcode/cli.py | 6 ++++-- src/datacustomcode/function_utils.py | 4 +++- src/datacustomcode/run.py | 3 ++- .../function/chunking/payload/entrypoint.py | 5 ++++- tests/test_function_utils.py | 13 +++++-------- tests/test_sf_cli_contract.py | 1 + 6 files changed, 19 insertions(+), 13 deletions(-) diff --git a/src/datacustomcode/cli.py b/src/datacustomcode/cli.py index a3aeda2..84da911 100644 --- a/src/datacustomcode/cli.py +++ b/src/datacustomcode/cli.py @@ -249,7 +249,8 @@ def deploy( logger.info(f"Inferred use_in_feature: {use_in_feature}") else: click.secho( - "Error: Could not infer function invoke options. Please provide --use-in-feature", + "Error: Could not infer function invoke options. 
" + "Please provide --use-in-feature", fg="red", ) raise click.Abort() @@ -331,7 +332,8 @@ def init(directory: str, code_type: str, use_in_feature: Optional[str]): click.echo( "Test your function locally with " + click.style( - f"datacustomcode run {entrypoint_path} --test-with {test_json_path}", + f"datacustomcode run {entrypoint_path} " + f"--test-with {test_json_path}", fg="blue", bold=True, ) diff --git a/src/datacustomcode/function_utils.py b/src/datacustomcode/function_utils.py index e1c91aa..8e6f12e 100644 --- a/src/datacustomcode/function_utils.py +++ b/src/datacustomcode/function_utils.py @@ -215,7 +215,9 @@ def inspect_function_types( Either can be None if not found or on error Example: - >>> request_name, response_name = inspect_function_types("payload/entrypoint.py") + >>> request_name, response_name = inspect_function_types( + ... "payload/entrypoint.py" + ... ) >>> print(request_name) # "SearchIndexChunkingV1Request" >>> print(response_name) # "SearchIndexChunkingV1Response" """ diff --git a/src/datacustomcode/run.py b/src/datacustomcode/run.py index 004f724..6322270 100644 --- a/src/datacustomcode/run.py +++ b/src/datacustomcode/run.py @@ -164,7 +164,8 @@ def run_function_with_test(entrypoint: str, test_file: str) -> None: load_function_module, ) - # Import the entrypoint module in the current environment (with all dependencies loaded) + # Import the entrypoint module in the current environment + # (with all dependencies loaded) module = load_function_module(entrypoint, "entrypoint_module") function_callable = get_function_callable(module) request_type = get_request_type(entrypoint) diff --git a/src/datacustomcode/templates/function/chunking/payload/entrypoint.py b/src/datacustomcode/templates/function/chunking/payload/entrypoint.py index 4796ef2..baa9a31 100644 --- a/src/datacustomcode/templates/function/chunking/payload/entrypoint.py +++ b/src/datacustomcode/templates/function/chunking/payload/entrypoint.py @@ -66,6 +66,9 @@ def function( 
output=chunks, status=SearchIndexStatusResponse( status_type="success", - status_message=f"Successfully chunked {len(request.input)} documents into {len(chunks)} chunks", + status_message=( + f"Successfully chunked {len(request.input)} documents " + f"into {len(chunks)} chunks" + ), ), ) diff --git a/tests/test_function_utils.py b/tests/test_function_utils.py index 081c743..cc0f51d 100644 --- a/tests/test_function_utils.py +++ b/tests/test_function_utils.py @@ -28,9 +28,7 @@ @pytest.fixture def sample_entrypoint(): """Create a temporary entrypoint file with a function.""" - with tempfile.NamedTemporaryFile( - mode="w", suffix=".py", delete=False - ) as temp_file: + with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as temp_file: entrypoint_content = textwrap.dedent( """ from typing import List @@ -62,9 +60,7 @@ def function(request: SampleRequest) -> SampleResponse: @pytest.fixture def entrypoint_no_annotations(): """Create an entrypoint with no type annotations.""" - with tempfile.NamedTemporaryFile( - mode="w", suffix=".py", delete=False - ) as temp_file: + with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as temp_file: entrypoint_content = textwrap.dedent( """ def function(request): @@ -117,6 +113,7 @@ def test_inspect_function_types_static(sample_entrypoint, entrypoint_no_annotati assert req_name is None assert resp_name is None + def test_inspect_function_types(sample_entrypoint): """Test dynamic inspection of function types.""" req_name, resp_name = function_utils.inspect_function_types(sample_entrypoint) @@ -139,7 +136,7 @@ def test_get_request_type(sample_entrypoint, entrypoint_no_annotations): def test_generate_test_json(): - """Test generating test.json file from entrypoint with simple and complex nested types.""" + """Test generating test.json with simple and complex nested types.""" temp_dir = tempfile.mkdtemp() models_file = os.path.join(temp_dir, "test_models.py") @@ -235,4 +232,4 @@ def function(request: 
ComplexRequest): if temp_dir in sys.path: sys.path.remove(temp_dir) if os.path.exists(temp_dir): - shutil.rmtree(temp_dir) \ No newline at end of file + shutil.rmtree(temp_dir) diff --git a/tests/test_sf_cli_contract.py b/tests/test_sf_cli_contract.py index 412b4c2..f53123e 100644 --- a/tests/test_sf_cli_contract.py +++ b/tests/test_sf_cli_contract.py @@ -188,6 +188,7 @@ def test_accepts_network_flag( result = runner.invoke(deploy, [*self._BASE_ARGS, "--network", "custom"]) assert result.exit_code != 2, result.output + class TestRunArgContract: """ SF CLI spawn: From e000a0333f60e9f603fd397f34d6be51a6dfd182 Mon Sep 17 00:00:00 2001 From: Rita Agarwala Date: Fri, 1 May 2026 09:47:27 +0530 Subject: [PATCH 15/19] Removing unnecessary emoji --- .../templates/function/chunking/payload/entrypoint.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/datacustomcode/templates/function/chunking/payload/entrypoint.py b/src/datacustomcode/templates/function/chunking/payload/entrypoint.py index baa9a31..311a51b 100644 --- a/src/datacustomcode/templates/function/chunking/payload/entrypoint.py +++ b/src/datacustomcode/templates/function/chunking/payload/entrypoint.py @@ -57,10 +57,10 @@ def function( ) chunks.append(chunk_output) - print(f" ✂️ Chunk {chunk_id}: {len(chunk_text)} chars") + print(f"Chunk {chunk_id}: {len(chunk_text)} chars") chunk_id += 1 - print(f"✅ Generated {len(chunks)} chunks total") + print(f"Generated {len(chunks)} chunks total") return SearchIndexChunkingV1Response( output=chunks, From 6df64a5b2413c6f43244b6a4e7225a874990bbbd Mon Sep 17 00:00:00 2001 From: Rita Agarwala Date: Fri, 1 May 2026 18:49:23 +0530 Subject: [PATCH 16/19] SearchIndexChunking contract --- .../function/feature_types/chunking.py | 165 +++++++++++++----- src/datacustomcode/function_utils.py | 5 +- 2 files changed, 129 insertions(+), 41 deletions(-) diff --git a/src/datacustomcode/function/feature_types/chunking.py 
b/src/datacustomcode/function/feature_types/chunking.py index 1a2f1d7..ab3108a 100644 --- a/src/datacustomcode/function/feature_types/chunking.py +++ b/src/datacustomcode/function/feature_types/chunking.py @@ -14,71 +14,156 @@ # limitations under the License. """ -Pydantic models for byoc-function-proto (uds_chunking.proto) -Auto-generated - validation rules from buf.validate +Pydantic models for Search Index Chunking V1 """ - from typing import ( - Any, Dict, List, + Union ) -from pydantic import BaseModel, Field +from enum import Enum +from pydantic import BaseModel, Field, ConfigDict -class SearchIndexDocElement(BaseModel): - """Document element to be chunked""" +class DocumentType(str, Enum): + """Document type enumeration""" + TEXT = "Text" - text: str = Field(..., description="Text content to be chunked") - metadata: Dict[str, Any] = Field( - default_factory=dict, description="Source document metadata" - ) +class SearchIndexChunkingV1PrependField(BaseModel): + """Field to prepend to chunk content""" + dmo_name: str = Field( + default="", + description="Data Model Object name", + examples=["udmo_1__dlm"] + ) + field_name: str = Field( + default="", + description="Field name to prepend", + examples=["ResolvedFilePath__c"] + ) + value: str = Field( + default="", + description="Field value to prepend", + examples=["udlo_1__dll:quarterly_report.pdf"] + ) + model_config = ConfigDict(extra='ignore') -class SearchIndexChunkOutput(BaseModel): - """Output chunk from the chunking process""" - chunk_id: str = Field(..., description="UUID for this chunk") - chunk_type: str = Field(..., description="Type: 'text'") - text: str = Field(..., description="Chunk text content") - seq_no: int = Field(..., description="Sequential chunk number (1-based)") - metadata: Dict[str, str] = Field( - default_factory=dict, description="Metadata from source (DMO fields)" +class SearchIndexChunkingV1Metadata(BaseModel): + """Metadata for input documents""" + type: DocumentType = Field( + 
default=DocumentType.TEXT, + description="Document type (Text)", + examples=["Text"] ) - tag_metadata: Dict[str, Any] = Field( - default_factory=dict, description="Additional tags" + page_number: int = Field( + default=0, + description="Page number in the source document (0-based)", + examples=[1] ) - citations: Dict[str, Any] = Field( - default_factory=dict, description="Citation information" + speaker: str = Field( + default="", + description="Speaker name for audio/video transcripts", + examples=["Narrator"] ) + start_timestamp: str = Field( + default="", + description="Start timestamp in ISO8601 format: YYYY-MM-DDTHH:MM:SS.ffffff", + examples=["2026-03-25T02:01:24.918000"] + ) + end_timestamp: str = Field( + default="", + description="End timestamp in ISO8601 format: YYYY-MM-DDTHH:MM:SS.ffffff", + examples=["2026-03-25T02:01:30.500000"] + ) + text_as_html: str = Field( + default="", + description="HTML representation of the document text", + examples=["

<table><tr><td>Online Remittance Instructions</td></tr></table>

"] + ) + source_dmo_fields: Dict[str, Union[str, int]] = Field( + default_factory=dict, + description="Source Data Model Object fields as key-value pairs (values can be string or int)", + examples=[ + { + "FilePath__c": "quarterly_report.pdf", + "Size__c": 1377454, + "ContentType__c": "pdf", + "LastModified__c": "2026-03-25T02:01:24.918000" + } + ] + ) + prepend: List[SearchIndexChunkingV1PrependField] = Field( + default_factory=list, + description="List of fields to prepend to each chunk" + ) + model_config = ConfigDict(extra='ignore') -class SearchIndexStatusResponse(BaseModel): - """Status response for operation""" +class SearchIndexChunkingV1DocElement(BaseModel): + """Document element to be chunked""" + text: str = Field( + default="", + description="Text content to be chunked", + examples=["Online Remittance Instructions\n\nTransfer proceeds from the sale of your ESOP/RSUs easily."] + ) + metadata: SearchIndexChunkingV1Metadata = Field( + default_factory=SearchIndexChunkingV1Metadata, + description="Source document metadata" + ) + model_config = ConfigDict(extra='ignore') - status_type: str = Field(..., description="'success' or 'error'") - status_message: str = Field(..., description="Human-readable status") +class SearchIndexChunkingV1Output(BaseModel): + """Output chunk from the chunking process""" + text: str = Field( + default="", + description="Chunk text content", + examples=["Online Remittance Instructions"] + ) + seq_no: int = Field( + default=0, + description="Sequential chunk number (1-based)", + ge=1, + examples=[1] + ) + chunk_id: str = Field( + default="", + description="Unique identifier for this chunk (UUID format)", + examples=["550e8400-e29b-41d4-a716-446655440000"] + ) + chunk_type: str = Field( + default="", + description="Type of chunk (e.g., 'text')", + examples=["text"] + ) + citations: Dict[str, str] = Field( + default_factory=dict, + description="Citation information as key-value pairs", + examples=[{"source": "quarterly_report.pdf"}] 
+ ) + metadata: str = Field( + default="", + description="JSON string containing metadata about the chunking output", + examples=['{"page": 1}'] + ) + model_config = ConfigDict(extra='ignore') -class SearchIndexChunkingV1Request(BaseModel): - """Batch request for UDS chunking""" - input: List[SearchIndexDocElement] = Field( - ..., min_length=1, description="List of documents (min 1)" - ) - max_characters: int = Field(..., description="Max chars per chunk (default: 100)") - additional_params: Dict[str, Any] = Field( - default_factory=dict, description="Future extension point" +class SearchIndexChunkingV1Request(BaseModel): + """Request for Search Index Chunking""" + input: List[SearchIndexChunkingV1DocElement] = Field( + default_factory=list, + description="List of documents to be chunked" ) + model_config = ConfigDict(extra='ignore') class SearchIndexChunkingV1Response(BaseModel): """Batch response for UDS chunking""" - - output: List[SearchIndexChunkOutput] = Field( + output: List[SearchIndexChunkingV1Output] = Field( default_factory=list, description="Flat list of chunks from all docs" ) - status: SearchIndexStatusResponse = Field( - ..., description="Overall operation status" - ) + model_config = ConfigDict(extra='ignore') diff --git a/src/datacustomcode/function_utils.py b/src/datacustomcode/function_utils.py index 8e6f12e..c499526 100644 --- a/src/datacustomcode/function_utils.py +++ b/src/datacustomcode/function_utils.py @@ -275,8 +275,11 @@ def _generate_model_sample_data(model_type): sample_data = {} for field_name, field_info in model_type.model_fields.items(): + # Use examples if available + if field_info.examples and len(field_info.examples) > 0: + sample_data[field_name] = field_info.examples[0] # Check if field has a real default value - if field_info.default is not PydanticUndefined: + elif field_info.default is not PydanticUndefined: sample_data[field_name] = field_info.default else: # Required field or field without default - generate sample From 
2116e5ea03e69020a8ed6deca8ba825eaa530908 Mon Sep 17 00:00:00 2001 From: Rita Agarwala Date: Sat, 2 May 2026 08:59:52 +0530 Subject: [PATCH 17/19] Removing dependency from chunking example --- .github/workflows/sf_cli_integration.yml | 10 +- .../function/feature_types/chunking.py | 135 +++++++++------- .../function/chunking/payload/entrypoint.py | 151 +++++++++++++----- .../function/chunking/requirements.txt | 1 - 4 files changed, 196 insertions(+), 101 deletions(-) diff --git a/.github/workflows/sf_cli_integration.yml b/.github/workflows/sf_cli_integration.yml index bc8a86b..5a515d1 100644 --- a/.github/workflows/sf_cli_integration.yml +++ b/.github/workflows/sf_cli_integration.yml @@ -259,11 +259,10 @@ jobs: run: | sf data-code-extension function run \ --entrypoint testFunction/payload/entrypoint.py \ - --test-with testFunction/payload/tests/test.json \ - -o dev1 || { - echo "::error::sf data-code-extension function run FAILED. Check mock server output above; the --entrypoint flag or SF CLI org auth contract may have changed." - exit 1 - } + --test-with testFunction/payload/tests/test.json || { + echo "::error::sf data-code-extension function run FAILED. Check mock server output above; the --entrypoint flag or SF CLI org auth contract may have changed." + exit 1 + } # ── Function: deploy ───────────────────────────────────────────────────── @@ -275,7 +274,6 @@ jobs: --description "Test function deploy" \ --package-dir testFunction/payload \ --cpu-size CPU_2XL \ - --function-invoke-opt UnstructuredChunking \ -o dev1 || { echo "::error::sf data-code-extension function deploy FAILED. Check mock server output above for which endpoint failed. The deploy command flags or API contract may have changed." 
exit 1 diff --git a/src/datacustomcode/function/feature_types/chunking.py b/src/datacustomcode/function/feature_types/chunking.py index ab3108a..1c1a28a 100644 --- a/src/datacustomcode/function/feature_types/chunking.py +++ b/src/datacustomcode/function/feature_types/chunking.py @@ -16,154 +16,181 @@ """ Pydantic models for Search Index Chunking V1 """ +from enum import Enum from typing import ( Dict, List, - Union + Union, ) -from enum import Enum -from pydantic import BaseModel, Field, ConfigDict +from pydantic import ( + BaseModel, + ConfigDict, + Field, +) class DocumentType(str, Enum): """Document type enumeration""" + TEXT = "Text" + TITLE = "Title" + TABLE = "Table" + IMAGE = "Image" + LIST_ITEM = "ListItem" + CODE_SNIPPET = "CodeSnippet" + PAGE_METADATA = "PageMetadata" + + +class ChunkType(str, Enum): + TEXT = "text" class SearchIndexChunkingV1PrependField(BaseModel): """Field to prepend to chunk content""" + dmo_name: str = Field( - default="", - description="Data Model Object name", - examples=["udmo_1__dlm"] + default="", description="Data Model Object name", examples=["udmo_1__dlm"] ) field_name: str = Field( default="", description="Field name to prepend", - examples=["ResolvedFilePath__c"] + examples=["ResolvedFilePath__c"], ) value: str = Field( default="", description="Field value to prepend", - examples=["udlo_1__dll:quarterly_report.pdf"] + examples=["udlo_1__dll:quarterly_report.pdf"], ) - model_config = ConfigDict(extra='ignore') + model_config = ConfigDict(extra="ignore") -class SearchIndexChunkingV1Metadata(BaseModel): - """Metadata for input documents""" - type: DocumentType = Field( - default=DocumentType.TEXT, - description="Document type (Text)", - examples=["Text"] - ) - page_number: int = Field( - default=0, - description="Page number in the source document (0-based)", - examples=[1] - ) +class SearchIndexChunkingV1TranscriptField(BaseModel): + """Field to prepend to chunk content""" + speaker: str = Field( default="", 
description="Speaker name for audio/video transcripts", - examples=["Narrator"] + examples=["Agent"], ) start_timestamp: str = Field( default="", description="Start timestamp in ISO8601 format: YYYY-MM-DDTHH:MM:SS.ffffff", - examples=["2026-03-25T02:01:24.918000"] + examples=["2026-03-25T02:01:24.918000"], ) end_timestamp: str = Field( default="", description="End timestamp in ISO8601 format: YYYY-MM-DDTHH:MM:SS.ffffff", - examples=["2026-03-25T02:01:30.500000"] + examples=["2026-03-25T02:01:30.500000"], + ) + model_config = ConfigDict(extra="ignore") + + +class SearchIndexChunkingV1Metadata(BaseModel): + """Metadata for input documents""" + + type: DocumentType = Field( + default=DocumentType.TEXT, description="Document type (Text)", examples=["Text"] + ) + transcript_fields: SearchIndexChunkingV1TranscriptField = Field( + default_factory=SearchIndexChunkingV1TranscriptField, + description=( + "Transcript information. Will only be there in case of audio-video files" + ), + ) + page_number: int = Field( + default=0, + description="Page number in the source document (0-based)", + examples=[1], ) text_as_html: str = Field( default="", description="HTML representation of the document text", - examples=["

Online Remittance Instructions

"] + examples=["

Online Remittance Instructions

"], ) source_dmo_fields: Dict[str, Union[str, int]] = Field( default_factory=dict, - description="Source Data Model Object fields as key-value pairs (values can be string or int)", + description=( + "Source Data Model Object fields as key-value pairs " + "(values can be string or int)" + ), examples=[ { "FilePath__c": "quarterly_report.pdf", "Size__c": 1377454, "ContentType__c": "pdf", - "LastModified__c": "2026-03-25T02:01:24.918000" + "LastModified__c": "2026-03-25T02:01:24.918000", } - ] + ], ) prepend: List[SearchIndexChunkingV1PrependField] = Field( - default_factory=list, - description="List of fields to prepend to each chunk" + default_factory=list, description="List of fields to prepend to each chunk" ) - model_config = ConfigDict(extra='ignore') + model_config = ConfigDict(extra="ignore") class SearchIndexChunkingV1DocElement(BaseModel): """Document element to be chunked""" + text: str = Field( default="", description="Text content to be chunked", - examples=["Online Remittance Instructions\n\nTransfer proceeds from the sale of your ESOP/RSUs easily."] + examples=[ + ( + "Online Remittance Instructions\n\n" + "Transfer proceeds from the sale of your ESOP/RSUs easily." 
+ ) + ], ) metadata: SearchIndexChunkingV1Metadata = Field( default_factory=SearchIndexChunkingV1Metadata, - description="Source document metadata" + description="Source document metadata", ) - model_config = ConfigDict(extra='ignore') + model_config = ConfigDict(extra="ignore") class SearchIndexChunkingV1Output(BaseModel): """Output chunk from the chunking process""" + text: str = Field( default="", description="Chunk text content", - examples=["Online Remittance Instructions"] + examples=["Online Remittance Instructions"], ) seq_no: int = Field( - default=0, - description="Sequential chunk number (1-based)", - ge=1, - examples=[1] + default=0, description="Sequential chunk number (1-based)", ge=1, examples=[1] ) chunk_id: str = Field( default="", description="Unique identifier for this chunk (UUID format)", - examples=["550e8400-e29b-41d4-a716-446655440000"] + examples=["550e8400-e29b-41d4-a716-446655440000"], ) - chunk_type: str = Field( - default="", + chunk_type: ChunkType = Field( + default=ChunkType.TEXT, description="Type of chunk (e.g., 'text')", - examples=["text"] + examples=["text"], ) citations: Dict[str, str] = Field( default_factory=dict, description="Citation information as key-value pairs", - examples=[{"source": "quarterly_report.pdf"}] + examples=[{"source": "quarterly_report.pdf"}], ) - metadata: str = Field( - default="", - description="JSON string containing metadata about the chunking output", - examples=['{"page": 1}'] - ) - model_config = ConfigDict(extra='ignore') + model_config = ConfigDict(extra="ignore") class SearchIndexChunkingV1Request(BaseModel): """Request for Search Index Chunking""" + input: List[SearchIndexChunkingV1DocElement] = Field( - default_factory=list, - description="List of documents to be chunked" + default_factory=list, description="List of documents to be chunked" ) - model_config = ConfigDict(extra='ignore') + model_config = ConfigDict(extra="ignore") class SearchIndexChunkingV1Response(BaseModel): """Batch response 
for UDS chunking""" + output: List[SearchIndexChunkingV1Output] = Field( default_factory=list, description="Flat list of chunks from all docs" ) - model_config = ConfigDict(extra='ignore') + model_config = ConfigDict(extra="ignore") diff --git a/src/datacustomcode/templates/function/chunking/payload/entrypoint.py b/src/datacustomcode/templates/function/chunking/payload/entrypoint.py index 311a51b..dd199a7 100644 --- a/src/datacustomcode/templates/function/chunking/payload/entrypoint.py +++ b/src/datacustomcode/templates/function/chunking/payload/entrypoint.py @@ -1,74 +1,145 @@ import logging - -from langchain_text_splitters import RecursiveCharacterTextSplitter +import uuid from datacustomcode.function import Runtime from datacustomcode.function.feature_types.chunking import ( + ChunkType, + SearchIndexChunkingV1Output, SearchIndexChunkingV1Request, SearchIndexChunkingV1Response, - SearchIndexChunkOutput, - SearchIndexStatusResponse, ) logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) +# Default max chunk size (can be overridden if contract adds max_characters field) +DEFAULT_MAX_CHUNK_SIZE = 50 + + +def split_text_into_chunks(text: str, max_size: int, overlap: int = 20): + """Split text into chunks with overlap, trying to break at natural boundaries. + + Tries to break at natural boundaries in order of preference: + 1. Paragraph boundaries (\\n\\n) + 2. Line boundaries (\\n) + 3. Sentence boundaries (. ! ?) + 4. Word boundaries (space) + 5. 
Hard cut if no good boundary found + + Args: + text: Text to split + max_size: Maximum characters per chunk + overlap: Number of characters to overlap between chunks + + Returns: + List of text chunks + """ + if len(text) <= max_size: + return [text] + + chunks = [] + start = 0 + + while start < len(text): + # Determine end position for this chunk + end = start + max_size + + if end >= len(text): + # Last chunk + chunks.append(text[start:]) + break + + # Try to find a good breaking point (in order of preference) + chunk_text = text[start:end] + break_point = None + + # Try to break at paragraph boundary (\n\n) + last_paragraph = chunk_text.rfind("\n\n") + if last_paragraph > max_size * 0.5: # Only if it's past halfway + break_point = start + last_paragraph + 2 # +2 to skip the \n\n + + # Try to break at line boundary (\n) + if break_point is None: + last_newline = chunk_text.rfind("\n") + if last_newline > max_size * 0.5: + break_point = start + last_newline + 1 + + # Try to break at sentence boundary (. ! ?) + if break_point is None: + for punct in [". ", "! ", "? "]: + last_sentence = chunk_text.rfind(punct) + if last_sentence > max_size * 0.5: + break_point = start + last_sentence + len(punct) + break + + # Try to break at word boundary (space) + if break_point is None: + last_space = chunk_text.rfind(" ") + if last_space > max_size * 0.5: + break_point = start + last_space + 1 + + # If no good breaking point, just hard cut + if break_point is None: + break_point = end + + chunks.append(text[start:break_point].strip()) + + # Move start position with overlap + start = max(break_point - overlap, start + 1) + + return chunks + def function( request: SearchIndexChunkingV1Request, runtime: Runtime ) -> SearchIndexChunkingV1Response: - print(f"Received {len(request.input)} documents to chunk") - print(f"Max characters per chunk: {request.max_characters}") + """Chunk documents into smaller pieces for search indexing. 
+ + Args: + request: SearchIndexChunkingV1Request with input documents + runtime: Runtime context (unused but required by contract) - # Initialize RecursiveCharacterTextSplitter - # It tries to split on: "\n\n", "\n", " ", "" (in that order) - text_splitter = RecursiveCharacterTextSplitter( - chunk_size=request.max_characters, - chunk_overlap=20, # Small overlap to maintain context - length_function=len, - separators=["\n\n", "\n", " ", ""], - ) + Returns: + SearchIndexChunkingV1Response with chunked output + """ + print(f"Received {len(request.input)} documents to chunk") chunks = [] - chunk_id = 1 + seq_no = 1 + + # Use default max chunk size + max_chunk_size = DEFAULT_MAX_CHUNK_SIZE # Process each document for doc_idx, doc in enumerate(request.input): text = doc.text - metadata = doc.metadata if hasattr(doc.metadata, "__iter__") else {} + metadata = doc.metadata - print(f"📄 Processing document {doc_idx + 1}: {len(text)} characters") + print(f"Processing document {doc_idx + 1}: {len(text)} characters") - # Split the text using RecursiveCharacterTextSplitter - text_chunks = text_splitter.split_text(text) + # Split the text using our simple chunking algorithm + text_chunks = split_text_into_chunks(text, max_chunk_size, overlap=20) # Create chunk outputs for chunk_text in text_chunks: - chunk_output = SearchIndexChunkOutput( - chunk_id=f"chunk_{chunk_id:04d}", - chunk_type="text", + # Create citations from source_dmo_fields if available + citations = {} + if metadata.source_dmo_fields: + for key, value in metadata.source_dmo_fields.items(): + citations[key] = str(value) + + chunk_output = SearchIndexChunkingV1Output( + chunk_id=str(uuid.uuid4()), + chunk_type=ChunkType.TEXT, text=chunk_text.strip(), - seq_no=chunk_id, - metadata={ - k: str(v) for k, v in (dict(metadata) if metadata else {}).items() - }, - tag_metadata={}, - citations={}, + seq_no=seq_no, + citations=citations, ) chunks.append(chunk_output) - print(f"Chunk {chunk_id}: {len(chunk_text)} chars") - 
chunk_id += 1 + print(f"Chunk {seq_no}: {len(chunk_text)} chars") + seq_no += 1 print(f"Generated {len(chunks)} chunks total") - return SearchIndexChunkingV1Response( - output=chunks, - status=SearchIndexStatusResponse( - status_type="success", - status_message=( - f"Successfully chunked {len(request.input)} documents " - f"into {len(chunks)} chunks" - ), - ), - ) + return SearchIndexChunkingV1Response(output=chunks) diff --git a/src/datacustomcode/templates/function/chunking/requirements.txt b/src/datacustomcode/templates/function/chunking/requirements.txt index 7f5990c..219536a 100644 --- a/src/datacustomcode/templates/function/chunking/requirements.txt +++ b/src/datacustomcode/templates/function/chunking/requirements.txt @@ -1,2 +1 @@ # Packages required for the chunking function -langchain-text-splitters>=0.3.0 From cb102376835a6bc8637a250cdde7d7e6b93baff6 Mon Sep 17 00:00:00 2001 From: Rita Agarwala Date: Sat, 2 May 2026 11:45:20 +0530 Subject: [PATCH 18/19] Updating DocumentType value --- .../function/feature_types/chunking.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/datacustomcode/function/feature_types/chunking.py b/src/datacustomcode/function/feature_types/chunking.py index 1c1a28a..fe0a239 100644 --- a/src/datacustomcode/function/feature_types/chunking.py +++ b/src/datacustomcode/function/feature_types/chunking.py @@ -33,13 +33,13 @@ class DocumentType(str, Enum): """Document type enumeration""" - TEXT = "Text" - TITLE = "Title" - TABLE = "Table" - IMAGE = "Image" - LIST_ITEM = "ListItem" - CODE_SNIPPET = "CodeSnippet" - PAGE_METADATA = "PageMetadata" + TEXT = "text" + TITLE = "title" + TABLE = "table" + IMAGE = "image" + LIST_ITEM = "list_item" + CODE_SNIPPET = "code_snippet" + PAGE_METADATA = "page_metadata" class ChunkType(str, Enum): @@ -90,7 +90,7 @@ class SearchIndexChunkingV1Metadata(BaseModel): """Metadata for input documents""" type: DocumentType = Field( - default=DocumentType.TEXT, 
description="Document type (Text)", examples=["Text"] + default=DocumentType.TEXT, description="Document type (text)", examples=["text"] ) transcript_fields: SearchIndexChunkingV1TranscriptField = Field( default_factory=SearchIndexChunkingV1TranscriptField, From 2c8f040c56ad14e0d615aee5066dad1c09c2df09 Mon Sep 17 00:00:00 2001 From: Rita Agarwala Date: Mon, 4 May 2026 13:15:38 +0530 Subject: [PATCH 19/19] Making text_as_html optional as it can be null --- src/datacustomcode/function/feature_types/chunking.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/datacustomcode/function/feature_types/chunking.py b/src/datacustomcode/function/feature_types/chunking.py index fe0a239..1425921 100644 --- a/src/datacustomcode/function/feature_types/chunking.py +++ b/src/datacustomcode/function/feature_types/chunking.py @@ -20,6 +20,7 @@ from typing import ( Dict, List, + Optional, Union, ) @@ -103,8 +104,8 @@ class SearchIndexChunkingV1Metadata(BaseModel): description="Page number in the source document (0-based)", examples=[1], ) - text_as_html: str = Field( - default="", + text_as_html: Optional[str] = Field( + default=None, description="HTML representation of the document text", examples=["

Online Remittance Instructions

"], )