Ground Truth Dataset Format
Ground truth datasets validate that a language handler extracts the correct entities from known source files. They are the gold standard for regression testing.
Directory Layout
tests/fixtures/code_intel/{language}/
sample_module.{ext} # Source file
expected_entities.json # Expected extraction results
Schema
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"type": "object",
"required": ["file", "language", "entities"],
"properties": {
"file": {
"type": "string",
"description": "Filename of the source fixture"
},
"language": {
"type": "string",
"description": "Language identifier matching LANGUAGE_HANDLERS key"
},
"module_path": {
"type": "string",
"description": "Module path passed to parse()",
"default": ""
},
"entities": {
"type": "array",
"items": {
"type": "object",
"required": ["name", "entity_type"],
"properties": {
"name": { "type": "string" },
"entity_type": {
"type": "string",
"enum": ["class", "function", "method", "module_variable", "enum", "interface", "type_alias"]
},
"qualified_name": { "type": "string" },
"line_start": { "type": "integer" },
"line_end": { "type": "integer" },
"signature": { "type": "string" },
"docstring": { "type": ["string", "null"] },
"parent_class": { "type": ["string", "null"] },
"decorators": { "type": "array", "items": { "type": "string" } },
"base_classes": { "type": "array", "items": { "type": "string" } },
"parameters": { "type": "array", "items": { "type": "string" } },
"is_async": { "type": "boolean" },
"is_static": { "type": "boolean" },
"enum_members": { "type": "array", "items": { "type": "string" } }
}
}
}
}
}
Example: Python
Source file: tests/fixtures/code_intel/python/sample_module.py
"""Sample module for ground truth validation."""

MAX_RETRIES = 3

class UserService:
    """Manages user operations."""

    def create_user(self, name: str) -> dict:
        """Create a new user."""
        return {"name": name}

    @staticmethod
    def validate_email(email: str) -> bool:
        return "@" in email


async def fetch_data(url: str) -> bytes:
    """Fetch data from URL."""
    pass
Expected entities: tests/fixtures/code_intel/python/expected_entities.json
{
"file": "sample_module.py",
"language": "python",
"module_path": "sample_module",
"entities": [
{
"name": "MAX_RETRIES",
"entity_type": "module_variable",
"qualified_name": "sample_module.MAX_RETRIES",
"line_start": 3,
"signature": "MAX_RETRIES = 3"
},
{
"name": "UserService",
"entity_type": "class",
"qualified_name": "sample_module.UserService",
"line_start": 5,
"docstring": "Manages user operations.",
"signature": "class UserService"
},
{
"name": "create_user",
"entity_type": "method",
"qualified_name": "sample_module.UserService.create_user",
"parent_class": "UserService",
"parameters": ["self", "name"],
"docstring": "Create a new user."
},
{
"name": "validate_email",
"entity_type": "method",
"qualified_name": "sample_module.UserService.validate_email",
"parent_class": "UserService",
"is_static": true,
"decorators": ["staticmethod"]
},
{
"name": "fetch_data",
"entity_type": "function",
"qualified_name": "sample_module.fetch_data",
"is_async": true,
"parameters": ["url"],
"docstring": "Fetch data from URL."
}
]
}
Validation Test Pattern
import json
from pathlib import Path
import pytest
from faos_api.knowledge_fabric.code.parsers.handler_registry import get_handler
FIXTURES = Path(__file__).parent.parent.parent / "fixtures" / "code_intel"


def load_ground_truth(language: str):
    """Load the fixture source and expected entities for ``language``.

    Args:
        language: Name of the sub-directory of ``FIXTURES`` that holds the
            dataset (for example ``"python"``).

    Returns:
        A ``(source, expected)`` tuple: the text of the source fixture named
        by the dataset's ``"file"`` key, and the parsed contents of
        ``expected_entities.json``.
    """
    gt_dir = FIXTURES / language
    # Parse the dataset once; its "file" entry names the source fixture.
    # Read as UTF-8 explicitly so results do not depend on the locale.
    expected = json.loads(
        (gt_dir / "expected_entities.json").read_text(encoding="utf-8")
    )
    source = (gt_dir / expected["file"]).read_text(encoding="utf-8")
    return source, expected
@pytest.mark.parametrize("language", ["python", "java", "go"])
def test_ground_truth(language: str):
    """Check a language handler's output against its ground-truth dataset.

    Parses the fixture source with the handler registered for ``language``
    and, for every expected entity, asserts the entity type plus whichever of
    ``qualified_name``, ``is_async`` and ``is_static`` the dataset specifies.
    """
    handler = get_handler(language)
    assert handler is not None, f"No handler registered for: {language}"

    source, expected = load_ground_truth(language)
    entities = handler.parse(
        source,
        module_path=expected.get("module_path", ""),
    )

    # Index by qualified name as well as bare name. A name-only index keeps
    # just the last entity for each name, so same-named entities in different
    # scopes would be checked against the wrong one.
    by_qualified_name = {e.qualified_name: e for e in entities}
    by_name = {e.name: e for e in entities}

    for exp in expected["entities"]:
        actual = None
        if "qualified_name" in exp:
            actual = by_qualified_name.get(exp["qualified_name"])
        if actual is None:
            # Fall back to the bare name when the dataset gives no qualified
            # name, or when nothing was extracted under the expected one.
            assert exp["name"] in by_name, f"Missing entity: {exp['name']}"
            actual = by_name[exp["name"]]

        assert actual.name == exp["name"], (
            f"{exp['name']}: matched entity is named {actual.name!r}"
        )
        assert actual.entity_type.value == exp["entity_type"], (
            f"{exp['name']}: entity_type mismatch"
        )
        if "qualified_name" in exp:
            assert actual.qualified_name == exp["qualified_name"], (
                f"{exp['name']}: qualified_name mismatch"
            )
        if "is_async" in exp:
            assert actual.is_async == exp["is_async"], (
                f"{exp['name']}: is_async mismatch"
            )
        if "is_static" in exp:
            assert actual.is_static == exp["is_static"], (
                f"{exp['name']}: is_static mismatch"
            )
Generating Datasets
- Write a representative source file covering all entity types the language supports.
- Run your handler and inspect the output:
# `source` is the text of your sample source file.
handler = get_handler("your_language")
entities = handler.parse(source, module_path="sample")
# One line per extracted entity: name, entity type, and starting line.
for e in entities:
    print(f"{e.name}: {e.entity_type.value} @ L{e.line_start}")
- Manually verify that each extracted entity matches what is actually in the source file.
- Save as JSON following the schema above.