Ground Truth Dataset Format

Ground truth datasets validate that a language handler extracts the correct entities from known source files. They are the gold standard for regression testing.

Directory Layout

tests/fixtures/code_intel/{language}/
    sample_module.{ext}          # Source file
    expected_entities.json       # Expected extraction results

Schema

{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "type": "object",
  "required": ["file", "language", "entities"],
  "properties": {
    "file": {
      "type": "string",
      "description": "Filename of the source fixture"
    },
    "language": {
      "type": "string",
      "description": "Language identifier matching LANGUAGE_HANDLERS key"
    },
    "module_path": {
      "type": "string",
      "description": "Module path passed to parse()",
      "default": ""
    },
    "entities": {
      "type": "array",
      "items": {
        "type": "object",
        "required": ["name", "entity_type"],
        "properties": {
          "name": { "type": "string" },
          "entity_type": {
            "type": "string",
            "enum": ["class", "function", "method", "module_variable", "enum", "interface", "type_alias"]
          },
          "qualified_name": { "type": "string" },
          "line_start": { "type": "integer" },
          "line_end": { "type": "integer" },
          "signature": { "type": "string" },
          "docstring": { "type": ["string", "null"] },
          "parent_class": { "type": ["string", "null"] },
          "decorators": { "type": "array", "items": { "type": "string" } },
          "base_classes": { "type": "array", "items": { "type": "string" } },
          "parameters": { "type": "array", "items": { "type": "string" } },
          "is_async": { "type": "boolean" },
          "is_static": { "type": "boolean" },
          "enum_members": { "type": "array", "items": { "type": "string" } }
        }
      }
    }
  }
}

Example: Python

Source file: tests/fixtures/code_intel/python/sample_module.py

"""Sample module for ground truth validation."""

MAX_RETRIES = 3

class UserService:
    """Manages user operations."""

    def create_user(self, name: str) -> dict:
        """Create a new user."""
        return {"name": name}

    @staticmethod
    def validate_email(email: str) -> bool:
        return "@" in email

async def fetch_data(url: str) -> bytes:
    """Fetch data from URL."""
    pass

Expected entities: tests/fixtures/code_intel/python/expected_entities.json

{
  "file": "sample_module.py",
  "language": "python",
  "module_path": "sample_module",
  "entities": [
    {
      "name": "MAX_RETRIES",
      "entity_type": "module_variable",
      "qualified_name": "sample_module.MAX_RETRIES",
      "line_start": 3,
      "signature": "MAX_RETRIES = 3"
    },
    {
      "name": "UserService",
      "entity_type": "class",
      "qualified_name": "sample_module.UserService",
      "line_start": 5,
      "docstring": "Manages user operations.",
      "signature": "class UserService"
    },
    {
      "name": "create_user",
      "entity_type": "method",
      "qualified_name": "sample_module.UserService.create_user",
      "parent_class": "UserService",
      "parameters": ["self", "name"],
      "docstring": "Create a new user."
    },
    {
      "name": "validate_email",
      "entity_type": "method",
      "qualified_name": "sample_module.UserService.validate_email",
      "parent_class": "UserService",
      "is_static": true,
      "decorators": ["staticmethod"]
    },
    {
      "name": "fetch_data",
      "entity_type": "function",
      "qualified_name": "sample_module.fetch_data",
      "is_async": true,
      "parameters": ["url"],
      "docstring": "Fetch data from URL."
    }
  ]
}

Validation Test Pattern

import json
from pathlib import Path

import pytest

from faos_api.knowledge_fabric.code.parsers.handler_registry import get_handler


FIXTURES = Path(__file__).parent.parent.parent / "fixtures" / "code_intel"


def load_ground_truth(language: str):
    """Load the fixture source text and its expected-entities dataset.

    Reads ``expected_entities.json`` from ``FIXTURES / language`` once, then
    reads the source file named by the dataset's ``"file"`` key from the same
    directory.

    Args:
        language: Name of the fixture sub-directory under ``FIXTURES``.

    Returns:
        A ``(source, expected)`` tuple: the source file's text and the parsed
        JSON dataset.
    """
    gt_dir = FIXTURES / language
    # Parse the dataset a single time and reuse it for both the filename
    # lookup and the return value.
    expected = json.loads(
        (gt_dir / "expected_entities.json").read_text(encoding="utf-8")
    )
    # Explicit UTF-8 so the result does not depend on the platform locale.
    source = (gt_dir / expected["file"]).read_text(encoding="utf-8")
    return source, expected


@pytest.mark.parametrize("language", ["python", "java", "go"])
def test_ground_truth(language: str):
    """Check a language handler's output against its ground-truth dataset.

    Parses the fixture source with the handler registered for ``language`` and
    asserts that every expected entity is present with the expected
    ``entity_type``. The optional fields ``qualified_name``, ``line_start``,
    ``is_async`` and ``is_static`` are compared only when the dataset entry
    specifies them.
    """
    handler = get_handler(language)
    assert handler is not None, f"No handler registered for: {language}"

    source, expected = load_ground_truth(language)
    entities = handler.parse(
        source,
        module_path=expected.get("module_path", ""),
    )

    # Bare names are not unique (two classes may define the same method name),
    # so prefer the qualified name for lookup and fall back to the bare name.
    by_qualified_name = {e.qualified_name: e for e in entities}
    by_name = {e.name: e for e in entities}

    # Optional dataset keys that are compared against the same-named attribute
    # on the parsed entity when present in the expected entry.
    optional_fields = ("qualified_name", "line_start", "is_async", "is_static")

    for exp in expected["entities"]:
        actual = None
        if "qualified_name" in exp:
            actual = by_qualified_name.get(exp["qualified_name"])
        if actual is None:
            # Falling back keeps the more specific qualified_name mismatch
            # message below instead of a generic "missing" failure.
            actual = by_name.get(exp["name"])
        assert actual is not None, f"Missing entity: {exp['name']}"

        assert actual.name == exp["name"], (
            f"{exp['name']}: matched entity is named {actual.name!r}"
        )
        assert actual.entity_type.value == exp["entity_type"], (
            f"{exp['name']}: entity_type {actual.entity_type.value!r} "
            f"!= {exp['entity_type']!r}"
        )

        for field in optional_fields:
            if field in exp:
                got = getattr(actual, field)
                assert got == exp[field], (
                    f"{exp['name']}: {field} {got!r} != {exp[field]!r}"
                )

Generating Datasets

Write a representative source file covering all entity types the language supports.

Run your handler and inspect the output:

# Print one "<name>: <entity type> @ L<start line>" row per extracted entity
# so the handler's output can be reviewed by eye.
handler = get_handler("your_language")
entities = handler.parse(source, module_path="sample")
for entity in entities:
    kind = entity.entity_type.value
    print(f"{entity.name}: {kind} @ L{entity.line_start}")

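Manually verify that each printed entity (name, type, and start line) matches what the source file actually contains.
Save the verified entities as expected_entities.json next to the source file, following the schema above.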

Directory Layout

Schema

Example: Python

Validation Test Pattern

Generating Datasets

Directory Layout

Schema

Example: Python

Validation Test Pattern

Generating Datasets