Containers

BaseDocument dataclass

Bases: DataContainer[str]

Base document container for raw text content.

Source code in healthchain/io/containers/base.py
@dataclass
class BaseDocument(DataContainer[str]):
    """Base document container for raw text content."""

    data: str
    text: str = field(init=False)

    def __post_init__(self):
        self.text = self.data

    def char_count(self) -> int:
        return len(self.text)
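
For illustration, a minimal usage sketch (assuming BaseDocument is importable from healthchain.io.containers.base, per the source path above):

from healthchain.io.containers.base import BaseDocument

doc = BaseDocument(data="Patient presents with chest pain.")
print(doc.text)          # mirrors .data after __post_init__
print(doc.char_count())  # character length of the text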

DataContainer dataclass

Bases: Generic[T]

A generic container for data.

This class represents a container for data with a specific type T.

ATTRIBUTE DESCRIPTION
data

The data stored in the container.

TYPE: T

METHOD DESCRIPTION
to_dict

Converts the container's data to a dictionary.

to_json

Converts the container's data to a JSON string.

from_dict

Creates a DataContainer instance from a dictionary.

from_json

Creates a DataContainer instance from a JSON string.

Source code in healthchain/io/containers/base.py
@dataclass
class DataContainer(Generic[T]):
    """
    A generic container for data.

    This class represents a container for data with a specific type T.

    Attributes:
        data (T): The data stored in the container.

    Methods:
        to_dict() -> Dict[str, Any]:
            Converts the container's data to a dictionary.

        to_json() -> str:
            Converts the container's data to a JSON string.

        from_dict(cls, data: Dict[str, Any]) -> "DataContainer":
            Creates a DataContainer instance from a dictionary.

        from_json(cls, json_str: str) -> "DataContainer":
            Creates a DataContainer instance from a JSON string.
    """

    data: T

    def to_dict(self) -> Dict[str, Any]:
        return {k: v for k, v in self.__dict__.items() if not k.startswith("_")}

    def to_json(self) -> str:
        return json.dumps(self.to_dict())

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "DataContainer":
        return cls(**data)

    @classmethod
    def from_json(cls, json_str: str) -> "DataContainer":
        return cls.from_dict(json.loads(json_str))
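
And a round-trip sketch through to_json / from_json, under the same import-path assumption:

from healthchain.io.containers.base import DataContainer

container = DataContainer(data={"score": 0.9})
payload = container.to_json()               # '{"data": {"score": 0.9}}'
restored = DataContainer.from_json(payload)
assert restored.data == {"score": 0.9}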

Dataset dataclass

Bases: DataContainer[DataFrame]

A container for tabular data optimized for ML inference, implemented as a lightweight wrapper around a pandas DataFrame.

ATTRIBUTE DESCRIPTION
data

The pandas DataFrame containing the dataset.

TYPE: DataFrame

metadata

Dict for storing pipeline results (predictions, probabilities, etc.)

TYPE: Dict[str, Any]

METHOD DESCRIPTION
from_csv

Load Dataset from CSV.

from_dict

Load Dataset from dict.

from_fhir_bundle

Create Dataset from FHIR Bundle and schema.

to_csv

Save Dataset to CSV.

to_risk_assessment

Convert predictions to FHIR RiskAssessment.

Source code in healthchain/io/containers/dataset.py
@dataclass
class Dataset(DataContainer[pd.DataFrame]):
    """
    A container for tabular data optimized for ML inference, implemented as a lightweight wrapper around a pandas DataFrame.

    Attributes:
        data: The pandas DataFrame containing the dataset.
        metadata: Dict for storing pipeline results (predictions, probabilities, etc.)

    Methods:
        from_csv: Load Dataset from CSV.
        from_dict: Load Dataset from dict.
        from_fhir_bundle: Create Dataset from FHIR Bundle and schema.
        to_csv: Save Dataset to CSV.
        to_risk_assessment: Convert predictions to FHIR RiskAssessment.
    """

    metadata: Dict[str, Any] = field(default_factory=dict)

    def __post_init__(self):
        if not isinstance(self.data, pd.DataFrame):
            raise TypeError("data must be a pandas DataFrame")

    @property
    def columns(self) -> List[str]:
        return list(self.data.columns)

    @property
    def index(self) -> pd.Index:
        return self.data.index

    @property
    def dtypes(self) -> Dict[str, str]:
        return {col: str(dtype) for col, dtype in self.data.dtypes.items()}

    def column_count(self) -> int:
        return len(self.columns)

    def row_count(self) -> int:
        return len(self.data)

    def get_dtype(self, column: str) -> str:
        return str(self.data[column].dtype)

    def __iter__(self) -> Iterator[str]:
        return iter(self.columns)

    def __len__(self) -> int:
        return self.row_count()

    def describe(self) -> str:
        return f"Dataset with {self.column_count()} columns and {self.row_count()} rows"

    def remove_column(self, name: str) -> None:
        self.data.drop(columns=[name], inplace=True)

    @classmethod
    def from_csv(cls, path: str, **kwargs) -> "Dataset":
        return cls(pd.read_csv(path, **kwargs))

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "Dataset":
        df = pd.DataFrame(data["data"])
        return cls(df)

    def to_csv(self, path: str, **kwargs) -> None:
        self.data.to_csv(path, **kwargs)

    @classmethod
    def from_fhir_bundle(
        cls,
        bundle: Union[Bundle, Dict[str, Any]],
        schema: Union[str, Path, FeatureSchema],
        aggregation: str = "mean",
    ) -> "Dataset":
        """Create Dataset from a FHIR Bundle using a feature schema.

        Extracts features from FHIR resources according to the schema specification,
        converting FHIR data to a pandas DataFrame suitable for ML inference.

        Args:
            bundle: FHIR Bundle resource (object or dict)
            schema: FeatureSchema object, or path to YAML schema file
            aggregation: How to aggregate multiple observation values.
                Options: "mean", "median", "max", "min", "last" (default: "mean")

        Returns:
            Dataset container with extracted features

        Example:
            >>> from fhir.resources.bundle import Bundle
            >>> bundle = Bundle(**patient_data)
            >>> dataset = Dataset.from_fhir_bundle(
            ...     bundle,
            ...     schema="healthchain/configs/features/sepsis_vitals.yaml"
            ... )
            >>> df = dataset.data
        """
        # Load schema if path provided
        if isinstance(schema, (str, Path)):
            schema = FeatureSchema.from_yaml(schema)

        # Extract features using mapper
        mapper = FHIRFeatureMapper(schema)
        df = mapper.extract_features(bundle, aggregation=aggregation)

        return cls(df)

    def validate(
        self, schema: FeatureSchema, raise_on_error: bool = False
    ) -> ValidationResult:
        """Validate DataFrame against a feature schema.

        Checks that required features are present and have correct data types.

        Args:
            schema: FeatureSchema to validate against
            raise_on_error: Whether to raise exception on validation failure

        Returns:
            ValidationResult with validation status and details

        Raises:
            ValueError: If raise_on_error is True and validation fails

        Example:
            >>> schema = FeatureSchema.from_yaml("configs/features/sepsis_vitals.yaml")
            >>> result = dataset.validate(schema)
            >>> if not result.valid:
            ...     print(result.errors)
        """
        result = ValidationResult(valid=True)

        # Check for missing required features
        required = schema.get_required_features()
        missing = [f for f in required if f not in self.data.columns]

        for feature in missing:
            result.add_missing_feature(feature)

        # Check data types for present features
        for feature_name, mapping in schema.features.items():
            if feature_name in self.data.columns:
                actual_dtype = str(self.data[feature_name].dtype)
                expected_dtype = mapping.dtype

                # Check for type mismatches (allow some flexibility)
                if not self._dtypes_compatible(actual_dtype, expected_dtype):
                    result.add_type_mismatch(feature_name, expected_dtype, actual_dtype)

        # Warn about optional missing features
        optional = set(schema.get_feature_names()) - set(required)
        missing_optional = [f for f in optional if f not in self.data.columns]

        for feature in missing_optional:
            result.add_warning(f"Optional feature '{feature}' is missing")

        if raise_on_error and not result.valid:
            raise ValueError(str(result))

        return result

    def _dtypes_compatible(self, actual: str, expected: str) -> bool:
        """Check if actual dtype is compatible with expected dtype.

        Args:
            actual: Actual dtype string
            expected: Expected dtype string

        Returns:
            True if dtypes are compatible
        """
        # Handle numeric types flexibly
        numeric_types = {"int64", "int32", "float64", "float32"}
        if expected in numeric_types and actual in numeric_types:
            return True

        # Exact match for non-numeric types
        return actual == expected

    def to_risk_assessment(
        self,
        outcome_code: str,
        outcome_display: str,
        outcome_system: str = "http://hl7.org/fhir/sid/icd-10",
        model_name: Optional[str] = None,
        model_version: Optional[str] = None,
        high_threshold: float = 0.7,
        moderate_threshold: float = 0.4,
        predictions: Optional[np.ndarray] = None,
        probabilities: Optional[np.ndarray] = None,
    ) -> List[RiskAssessment]:
        """Convert model predictions to FHIR RiskAssessment resources.

        Creates RiskAssessment resources from ML model output, suitable for
        including in FHIR Bundles or sending to FHIR servers.

        Args:
            outcome_code: Code for the predicted outcome (e.g., "A41.9" for sepsis)
            outcome_display: Display text for the outcome (e.g., "Sepsis")
            outcome_system: Code system for the outcome (default: ICD-10)
            model_name: Name of the ML model (optional)
            model_version: Version of the ML model (optional)
            high_threshold: Threshold for high risk (default: 0.7)
            moderate_threshold: Threshold for moderate risk (default: 0.4)
            predictions: Binary predictions array (0/1). Defaults to metadata["predictions"]
            probabilities: Probability scores array (0-1). Defaults to metadata["probabilities"]

        Returns:
            List of RiskAssessment resources, one per patient

        Example:
            >>> risk_assessments = dataset.to_risk_assessment(
            ...     outcome_code="A41.9",
            ...     outcome_display="Sepsis, unspecified",
            ...     model_name="RandomForest",
            ...     model_version="1.0"
            ... )
        """
        # Fall back to metadata if not provided
        if predictions is None:
            predictions = self.metadata.get("predictions")
        if probabilities is None:
            probabilities = self.metadata.get("probabilities")

        if predictions is None or probabilities is None:
            raise ValueError(
                "predictions and probabilities must be provided or available in metadata"
            )

        if len(predictions) != len(self.data):
            raise ValueError(
                f"Predictions length ({len(predictions)}) must match "
                f"DataFrame length ({len(self.data)})"
            )

        if len(probabilities) != len(self.data):
            raise ValueError(
                f"Probabilities length ({len(probabilities)}) must match "
                f"DataFrame length ({len(self.data)})"
            )

        risk_assessments = []

        # Get patient references
        if "patient_ref" not in self.data.columns:
            raise ValueError("DataFrame must have 'patient_ref' column")

        for idx, row in self.data.iterrows():
            patient_ref = row["patient_ref"]
            prediction = int(predictions[idx])
            probability = float(probabilities[idx])

            # Determine qualitative risk
            if probability >= high_threshold:
                qualitative_risk = "high"
            elif probability >= moderate_threshold:
                qualitative_risk = "moderate"
            else:
                qualitative_risk = "low"

            # Build prediction dict
            prediction_dict = {
                "outcome": {
                    "code": outcome_code,
                    "display": outcome_display,
                    "system": outcome_system,
                },
                "probability": probability,
                "qualitative_risk": qualitative_risk,
            }

            # Create method CodeableConcept if model info provided
            method = None
            if model_name:
                method = create_single_codeable_concept(
                    code=model_name,
                    display=f"{model_name} v{model_version}"
                    if model_version
                    else model_name,
                    system="https://healthchain.github.io/ml-models",
                )

            # Create comment with prediction details
            comment = (
                f"ML prediction: {'Positive' if prediction == 1 else 'Negative'} "
                f"(probability: {probability:.2%}, risk: {qualitative_risk})"
            )

            # Create RiskAssessment
            risk_assessment = create_risk_assessment_from_prediction(
                subject=patient_ref,
                prediction=prediction_dict,
                method=method,
                comment=comment,
            )

            risk_assessments.append(risk_assessment)

        return risk_assessments
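
A quick construction sketch (import path assumed from the source location above; the column names are purely illustrative):

import pandas as pd
from healthchain.io.containers.dataset import Dataset

ds = Dataset(
    pd.DataFrame(
        {"heart_rate": [88.0, 102.0], "patient_ref": ["Patient/1", "Patient/2"]}
    )
)
print(ds.describe())  # Dataset with 2 columns and 2 rows
print(ds.dtypes)      # {'heart_rate': 'float64', 'patient_ref': 'object'}
print(len(ds))        # 2 (rows)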

from_fhir_bundle(bundle, schema, aggregation='mean') classmethod

Create Dataset from a FHIR Bundle using a feature schema.

Extracts features from FHIR resources according to the schema specification, converting FHIR data to a pandas DataFrame suitable for ML inference.

PARAMETER DESCRIPTION
bundle

FHIR Bundle resource (object or dict)

TYPE: Union[Bundle, Dict[str, Any]]

schema

FeatureSchema object, or path to YAML schema file

TYPE: Union[str, Path, FeatureSchema]

aggregation

How to aggregate multiple observation values. Options: "mean", "median", "max", "min", "last".

TYPE: str DEFAULT: 'mean'

RETURNS DESCRIPTION
Dataset

Dataset container with extracted features

Example

>>> from fhir.resources.bundle import Bundle
>>> bundle = Bundle(**patient_data)
>>> dataset = Dataset.from_fhir_bundle(
...     bundle,
...     schema="healthchain/configs/features/sepsis_vitals.yaml"
... )
>>> df = dataset.data

Source code in healthchain/io/containers/dataset.py
@classmethod
def from_fhir_bundle(
    cls,
    bundle: Union[Bundle, Dict[str, Any]],
    schema: Union[str, Path, FeatureSchema],
    aggregation: str = "mean",
) -> "Dataset":
    """Create Dataset from a FHIR Bundle using a feature schema.

    Extracts features from FHIR resources according to the schema specification,
    converting FHIR data to a pandas DataFrame suitable for ML inference.

    Args:
        bundle: FHIR Bundle resource (object or dict)
        schema: FeatureSchema object, or path to YAML schema file
        aggregation: How to aggregate multiple observation values.
            Options: "mean", "median", "max", "min", "last" (default: "mean")

    Returns:
        Dataset container with extracted features

    Example:
        >>> from fhir.resources.bundle import Bundle
        >>> bundle = Bundle(**patient_data)
        >>> dataset = Dataset.from_fhir_bundle(
        ...     bundle,
        ...     schema="healthchain/configs/features/sepsis_vitals.yaml"
        ... )
        >>> df = dataset.data
    """
    # Load schema if path provided
    if isinstance(schema, (str, Path)):
        schema = FeatureSchema.from_yaml(schema)

    # Extract features using mapper
    mapper = FHIRFeatureMapper(schema)
    df = mapper.extract_features(bundle, aggregation=aggregation)

    return cls(df)

to_risk_assessment(outcome_code, outcome_display, outcome_system='http://hl7.org/fhir/sid/icd-10', model_name=None, model_version=None, high_threshold=0.7, moderate_threshold=0.4, predictions=None, probabilities=None)

Convert model predictions to FHIR RiskAssessment resources.

Creates RiskAssessment resources from ML model output, suitable for including in FHIR Bundles or sending to FHIR servers.

PARAMETER DESCRIPTION
outcome_code

Code for the predicted outcome (e.g., "A41.9" for sepsis)

TYPE: str

outcome_display

Display text for the outcome (e.g., "Sepsis")

TYPE: str

outcome_system

Code system for the outcome (default: ICD-10)

TYPE: str DEFAULT: 'http://hl7.org/fhir/sid/icd-10'

model_name

Name of the ML model (optional)

TYPE: Optional[str] DEFAULT: None

model_version

Version of the ML model (optional)

TYPE: Optional[str] DEFAULT: None

high_threshold

Threshold for high risk (default: 0.7)

TYPE: float DEFAULT: 0.7

moderate_threshold

Threshold for moderate risk (default: 0.4)

TYPE: float DEFAULT: 0.4

predictions

Binary predictions array (0/1). Defaults to metadata["predictions"]

TYPE: Optional[ndarray] DEFAULT: None

probabilities

Probability scores array (0-1). Defaults to metadata["probabilities"]

TYPE: Optional[ndarray] DEFAULT: None

RETURNS DESCRIPTION
List[RiskAssessment]

List of RiskAssessment resources, one per patient

Example

>>> risk_assessments = dataset.to_risk_assessment(
...     outcome_code="A41.9",
...     outcome_display="Sepsis, unspecified",
...     model_name="RandomForest",
...     model_version="1.0"
... )

Source code in healthchain/io/containers/dataset.py
def to_risk_assessment(
    self,
    outcome_code: str,
    outcome_display: str,
    outcome_system: str = "http://hl7.org/fhir/sid/icd-10",
    model_name: Optional[str] = None,
    model_version: Optional[str] = None,
    high_threshold: float = 0.7,
    moderate_threshold: float = 0.4,
    predictions: Optional[np.ndarray] = None,
    probabilities: Optional[np.ndarray] = None,
) -> List[RiskAssessment]:
    """Convert model predictions to FHIR RiskAssessment resources.

    Creates RiskAssessment resources from ML model output, suitable for
    including in FHIR Bundles or sending to FHIR servers.

    Args:
        outcome_code: Code for the predicted outcome (e.g., "A41.9" for sepsis)
        outcome_display: Display text for the outcome (e.g., "Sepsis")
        outcome_system: Code system for the outcome (default: ICD-10)
        model_name: Name of the ML model (optional)
        model_version: Version of the ML model (optional)
        high_threshold: Threshold for high risk (default: 0.7)
        moderate_threshold: Threshold for moderate risk (default: 0.4)
        predictions: Binary predictions array (0/1). Defaults to metadata["predictions"]
        probabilities: Probability scores array (0-1). Defaults to metadata["probabilities"]

    Returns:
        List of RiskAssessment resources, one per patient

    Example:
        >>> risk_assessments = dataset.to_risk_assessment(
        ...     outcome_code="A41.9",
        ...     outcome_display="Sepsis, unspecified",
        ...     model_name="RandomForest",
        ...     model_version="1.0"
        ... )
    """
    # Fall back to metadata if not provided
    if predictions is None:
        predictions = self.metadata.get("predictions")
    if probabilities is None:
        probabilities = self.metadata.get("probabilities")

    if predictions is None or probabilities is None:
        raise ValueError(
            "predictions and probabilities must be provided or available in metadata"
        )

    if len(predictions) != len(self.data):
        raise ValueError(
            f"Predictions length ({len(predictions)}) must match "
            f"DataFrame length ({len(self.data)})"
        )

    if len(probabilities) != len(self.data):
        raise ValueError(
            f"Probabilities length ({len(probabilities)}) must match "
            f"DataFrame length ({len(self.data)})"
        )

    risk_assessments = []

    # Get patient references
    if "patient_ref" not in self.data.columns:
        raise ValueError("DataFrame must have 'patient_ref' column")

    for idx, row in self.data.iterrows():
        patient_ref = row["patient_ref"]
        prediction = int(predictions[idx])
        probability = float(probabilities[idx])

        # Determine qualitative risk
        if probability >= high_threshold:
            qualitative_risk = "high"
        elif probability >= moderate_threshold:
            qualitative_risk = "moderate"
        else:
            qualitative_risk = "low"

        # Build prediction dict
        prediction_dict = {
            "outcome": {
                "code": outcome_code,
                "display": outcome_display,
                "system": outcome_system,
            },
            "probability": probability,
            "qualitative_risk": qualitative_risk,
        }

        # Create method CodeableConcept if model info provided
        method = None
        if model_name:
            method = create_single_codeable_concept(
                code=model_name,
                display=f"{model_name} v{model_version}"
                if model_version
                else model_name,
                system="https://healthchain.github.io/ml-models",
            )

        # Create comment with prediction details
        comment = (
            f"ML prediction: {'Positive' if prediction == 1 else 'Negative'} "
            f"(probability: {probability:.2%}, risk: {qualitative_risk})"
        )

        # Create RiskAssessment
        risk_assessment = create_risk_assessment_from_prediction(
            subject=patient_ref,
            prediction=prediction_dict,
            method=method,
            comment=comment,
        )

        risk_assessments.append(risk_assessment)

    return risk_assessments
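
A sketch of the metadata fallback path, continuing the hypothetical ds from the earlier sketch (its DataFrame already carries the required patient_ref column):

import numpy as np

ds.metadata["predictions"] = np.array([1, 0])
ds.metadata["probabilities"] = np.array([0.85, 0.12])

assessments = ds.to_risk_assessment(
    outcome_code="A41.9",
    outcome_display="Sepsis, unspecified",
)
# Two RiskAssessment resources: one "high" risk (0.85 >= 0.7),
# one "low" risk (0.12 < 0.4)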

validate(schema, raise_on_error=False)

Validate DataFrame against a feature schema.

Checks that required features are present and have correct data types.

PARAMETER DESCRIPTION
schema

FeatureSchema to validate against

TYPE: FeatureSchema

raise_on_error

Whether to raise exception on validation failure

TYPE: bool DEFAULT: False

RETURNS DESCRIPTION
ValidationResult

ValidationResult with validation status and details

RAISES DESCRIPTION
ValueError

If raise_on_error is True and validation fails

Example

>>> schema = FeatureSchema.from_yaml("configs/features/sepsis_vitals.yaml")
>>> result = dataset.validate(schema)
>>> if not result.valid:
...     print(result.errors)

Source code in healthchain/io/containers/dataset.py
def validate(
    self, schema: FeatureSchema, raise_on_error: bool = False
) -> ValidationResult:
    """Validate DataFrame against a feature schema.

    Checks that required features are present and have correct data types.

    Args:
        schema: FeatureSchema to validate against
        raise_on_error: Whether to raise exception on validation failure

    Returns:
        ValidationResult with validation status and details

    Raises:
        ValueError: If raise_on_error is True and validation fails

    Example:
        >>> schema = FeatureSchema.from_yaml("configs/features/sepsis_vitals.yaml")
        >>> result = dataset.validate(schema)
        >>> if not result.valid:
        ...     print(result.errors)
    """
    result = ValidationResult(valid=True)

    # Check for missing required features
    required = schema.get_required_features()
    missing = [f for f in required if f not in self.data.columns]

    for feature in missing:
        result.add_missing_feature(feature)

    # Check data types for present features
    for feature_name, mapping in schema.features.items():
        if feature_name in self.data.columns:
            actual_dtype = str(self.data[feature_name].dtype)
            expected_dtype = mapping.dtype

            # Check for type mismatches (allow some flexibility)
            if not self._dtypes_compatible(actual_dtype, expected_dtype):
                result.add_type_mismatch(feature_name, expected_dtype, actual_dtype)

    # Warn about optional missing features
    optional = set(schema.get_feature_names()) - set(required)
    missing_optional = [f for f in optional if f not in self.data.columns]

    for feature in missing_optional:
        result.add_warning(f"Optional feature '{feature}' is missing")

    if raise_on_error and not result.valid:
        raise ValueError(str(result))

    return result

Document dataclass

Bases: BaseDocument

Main document container for processing textual and clinical data in HealthChain.

The Document class is the primary structure used throughout annotation and analytics pipelines, accumulating transformations, extractions, and results from each stage. It seamlessly integrates raw text, NLP annotations, FHIR resources, clinical decision support (CDS) results, and ML model outputs in one object.

Features
  • Accepts text, FHIR Bundles/resources, or lists of FHIR resources as input.
  • Provides basic tokenization and supports integration with NLP models (spaCy, transformers).
  • Stores and manipulates clinical FHIR data via the .fhir property (access to bundles, problem lists, meds, allergies, etc.).
  • Encapsulates CDS Hooks-style decision support cards and suggested actions via the .cds property.
  • Stores outputs from external ML/LLM models: HuggingFace, LangChain, etc.
ATTRIBUTE DESCRIPTION
nlp

NLP output (tokens, entities, embeddings, spaCy doc)

TYPE: NlpAnnotations

fhir

FHIR resources and context (problem list, medication, allergy, etc.)

TYPE: FhirData

cds

Clinical decision support (cards and actions)

TYPE: CdsAnnotations

models

Results from ML/LLM models (HuggingFace, LangChain, etc.)

TYPE: ModelOutputs

text

The text content of the document (if available).

TYPE: str

data

The original input supplied (raw text, Bundle, resource, or list of resources)

TYPE: Union[str, Bundle, List[Resource]]

Usage example

>>> doc = Document(data="Patient has hypertension")
>>> doc.nlp._tokens
['Patient', 'has', 'hypertension']
>>> doc.fhir.problem_list = [Condition(...)]
>>> doc.cds.cards = [Card(...)]
>>> doc.models.huggingface_results = ...
>>> for token in doc:
...     print(token)

Inherits from

BaseDocument

Source code in healthchain/io/containers/document.py
@dataclass
class Document(BaseDocument):
    """
    Main document container for processing textual and clinical data in HealthChain.

    The Document class is the primary structure used throughout annotation and analytics
    pipelines, accumulating transformations, extractions, and results from each stage. It
    seamlessly integrates raw text, NLP annotations, FHIR resources, clinical decision
    support (CDS) results, and ML model outputs in one object.

    Features:
        - Accepts text, FHIR Bundles/resources, or lists of FHIR resources as input.
        - Provides basic tokenization and supports integration with NLP models (spaCy, transformers).
        - Stores and manipulates clinical FHIR data via the .fhir property (access to bundles, problem lists, meds, allergies, etc.).
        - Encapsulates CDS Hooks-style decision support cards and suggested actions via the .cds property.
        - Stores outputs from external ML/LLM models: HuggingFace, LangChain, etc.

    Attributes:
        nlp (NlpAnnotations): NLP output (tokens, entities, embeddings, spaCy doc)
        fhir (FhirData): FHIR resources and context (problem list, medication, allergy, etc.)
        cds (CdsAnnotations): Clinical decision support (cards and actions)
        models (ModelOutputs): Results from ML/LLM models (HuggingFace, LangChain, etc.)
        text (str): The text content of the document (if available).
        data: The original input supplied (raw text, Bundle, resource, or list of resources)

    Usage example:
        >>> doc = Document(data="Patient has hypertension")
        >>> doc.nlp._tokens
        ['Patient', 'has', 'hypertension']
        >>> doc.fhir.problem_list = [Condition(...)]
        >>> doc.cds.cards = [Card(...)]
        >>> doc.models.huggingface_results = ...
        >>> for token in doc:
        ...     print(token)

    Inherits from:
        BaseDocument
    """

    _nlp: NlpAnnotations = field(default_factory=NlpAnnotations)
    _fhir: FhirData = field(default_factory=FhirData)
    _cds: CdsAnnotations = field(default_factory=CdsAnnotations)
    _models: ModelOutputs = field(default_factory=ModelOutputs)

    @property
    def nlp(self) -> NlpAnnotations:
        return self._nlp

    @property
    def fhir(self) -> FhirData:
        return self._fhir

    @property
    def cds(self) -> CdsAnnotations:
        return self._cds

    @property
    def models(self) -> ModelOutputs:
        return self._models

    def __post_init__(self):
        """
        Post-initialization setup to process textual or FHIR data.

        - If input data is a FHIR Bundle, stores it and extracts OperationOutcome and Provenance resources.
        - If input data is a list of FHIR resources, wraps them in a Bundle.
        - For text input, sets .text field accordingly.
        - Performs basic whitespace tokenization if necessary.
        """
        super().__post_init__()

        # Handle FHIR Bundle data
        if isinstance(self.data, Bundle):
            self._fhir._bundle = self.data

            # Extract OperationOutcome resources (operation results/errors)
            outcomes = extract_resources(self._fhir._bundle, "OperationOutcome")
            if outcomes:
                self._fhir._operation_outcomes = outcomes

            # Extract Provenance resources (data lineage/origin)
            provenances = extract_resources(self._fhir._bundle, "Provenance")
            if provenances:
                self._fhir._provenances = provenances

            self.text = ""  # No text content for bundle-only documents
        # Handle list of FHIR resources
        elif (
            isinstance(self.data, list)
            and self.data
            and isinstance(self.data[0], Resource)
        ):
            self._fhir._bundle = create_bundle()
            for resource in self.data:
                add_resource(self._fhir._bundle, resource)
            self.text = ""  # No text content for resource-only documents
        else:
            # Handle text data
            self.text = self.data if isinstance(self.data, str) else str(self.data)

        if not self._nlp._tokens and self.text:
            self._nlp._tokens = self.text.split()  # Basic tokenization if not provided

    def word_count(self) -> int:
        """
        Return the number of word tokens in the document.

        Returns:
            int: The count of tokenized words in the document.
        """
        return len(self._nlp._tokens)

    def update_problem_list_from_nlp(
        self,
        patient_ref: str = "Patient/123",
        coding_system: str = "http://snomed.info/sct",
        code_attribute: str = "cui",
    ):
        """
        Populate or update the problem list using entities extracted via NLP.

        This method looks for entities with associated medical codes and creates FHIR Condition
    resources from them. It supports a three-step process:
        1. NER: Extract entities from text (spaCy, HuggingFace, etc.)
        2. Entity Linking: Add medical codes to those entities
        3. Problem List Creation: Convert linked entities to FHIR conditions (this method)

        The method extracts from:
        1. spaCy entities with extension attributes (e.g., ent._.cui)
        2. Generic entities in the NLP annotations container (framework-agnostic)

        TODO: make this more generic and support other resource types

        Args:
            patient_ref: FHIR reference to the patient (default: "Patient/123")
            coding_system: Coding system URI for the conditions (default: SNOMED CT)
            code_attribute: Name of the attribute containing the medical code (default: "cui")

        Notes:
            - Preserves any existing problem list Conditions.
            - Supports framework-agnostic extraction (spaCy and dict entities).
            - For spaCy, looks for entity extension attribute (e.g. ent._.cui).
            - For non-spaCy, expects codes as dict keys (ent["cui"], etc.).
        """
        # Start with existing conditions to preserve them
        existing_conditions = self.fhir.problem_list.copy()
        new_conditions = []

        # 1. Extract from spaCy entities (if available)
        if self.nlp._spacy_doc and self.nlp._spacy_doc.ents:
            for ent in self.nlp._spacy_doc.ents:
                if not Span.has_extension(code_attribute):
                    logger.debug(
                        f"Extension '{code_attribute}' not found for spaCy entity {ent.text}"
                    )
                    continue

                code_value = getattr(ent._, code_attribute, None)
                if code_value is None:
                    logger.debug(
                        f"No {code_attribute} found for spaCy entity {ent.text}"
                    )
                    continue

                condition = create_condition(
                    subject=patient_ref,
                    code=code_value,
                    display=ent.text,
                    system=coding_system,
                )
                set_condition_category(condition, "problem-list-item")
                logger.debug(
                    f"Adding condition from spaCy: {condition.model_dump(exclude_none=True)}"
                )
                new_conditions.append(condition)

        # 2. Extract from generic NLP entities (framework-agnostic)
        generic_entities = self.nlp.get_entities()
        if generic_entities:
            for ent_dict in generic_entities:
                # Skip if no linked code
                code_value = ent_dict.get(code_attribute)
                if code_value is None:
                    logger.debug(
                        f"No {code_attribute} found for entity {ent_dict.get('text', 'unknown')}"
                    )
                    continue

                entity_text = ent_dict.get("text", "unknown")

                condition = create_condition(
                    subject=patient_ref,
                    code=code_value,
                    display=entity_text,
                    system=coding_system,
                )
                set_condition_category(condition, "problem-list-item")
                logger.debug(
                    f"Adding condition from entities: {condition.model_dump(exclude_none=True)}"
                )
                new_conditions.append(condition)

        # Update problem list with combined conditions (replace to avoid duplication)
        if new_conditions:
            all_conditions = existing_conditions + new_conditions
            self.fhir.add_resources(all_conditions, "Condition", replace=True)

    def __iter__(self) -> Iterator[str]:
        """
        Iterate through the document's tokens.

        Returns:
            Iterator[str]: Iterator over the document tokens.
        """
        return iter(self._nlp._tokens)

    def __len__(self) -> int:
        """
        Return the length of the document's text.

        Returns:
            int: Character length of the document text.
        """
        return len(self.text)
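
Two construction paths, sketched under the same import-path assumptions as above:

from fhir.resources.patient import Patient
from healthchain.io.containers.document import Document

# A list of FHIR resources is wrapped into a Bundle; text stays empty.
fhir_doc = Document(data=[Patient(id="123")])
print(fhir_doc.text)          # ""
print(fhir_doc.word_count())  # 0

# Plain text is whitespace-tokenized by default.
text_doc = Document(data="Patient has hypertension")
print(text_doc.word_count())  # 3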

__iter__()

Iterate through the document's tokens.

RETURNS DESCRIPTION
Iterator[str]

Iterator[str]: Iterator over the document tokens.

Source code in healthchain/io/containers/document.py
def __iter__(self) -> Iterator[str]:
    """
    Iterate through the document's tokens.

    Returns:
        Iterator[str]: Iterator over the document tokens.
    """
    return iter(self._nlp._tokens)

__len__()

Return the length of the document's text.

RETURNS DESCRIPTION
int

Character length of the document text.

TYPE: int

Source code in healthchain/io/containers/document.py
def __len__(self) -> int:
    """
    Return the length of the document's text.

    Returns:
        int: Character length of the document text.
    """
    return len(self.text)

__post_init__()

Post-initialization setup to process textual or FHIR data.

  • If input data is a FHIR Bundle, stores it and extracts OperationOutcome and Provenance resources.
  • If input data is a list of FHIR resources, wraps them in a Bundle.
  • For text input, sets .text field accordingly.
  • Performs basic whitespace tokenization if necessary.
Source code in healthchain/io/containers/document.py
def __post_init__(self):
    """
    Post-initialization setup to process textual or FHIR data.

    - If input data is a FHIR Bundle, stores it and extracts OperationOutcome and Provenance resources.
    - If input data is a list of FHIR resources, wraps them in a Bundle.
    - For text input, sets .text field accordingly.
    - Performs basic whitespace tokenization if necessary.
    """
    super().__post_init__()

    # Handle FHIR Bundle data
    if isinstance(self.data, Bundle):
        self._fhir._bundle = self.data

        # Extract OperationOutcome resources (operation results/errors)
        outcomes = extract_resources(self._fhir._bundle, "OperationOutcome")
        if outcomes:
            self._fhir._operation_outcomes = outcomes

        # Extract Provenance resources (data lineage/origin)
        provenances = extract_resources(self._fhir._bundle, "Provenance")
        if provenances:
            self._fhir._provenances = provenances

        self.text = ""  # No text content for bundle-only documents
    # Handle list of FHIR resources
    elif (
        isinstance(self.data, list)
        and self.data
        and isinstance(self.data[0], Resource)
    ):
        self._fhir._bundle = create_bundle()
        for resource in self.data:
            add_resource(self._fhir._bundle, resource)
        self.text = ""  # No text content for resource-only documents
    else:
        # Handle text data
        self.text = self.data if isinstance(self.data, str) else str(self.data)

    if not self._nlp._tokens and self.text:
        self._nlp._tokens = self.text.split()  # Basic tokenization if not provided

update_problem_list_from_nlp(patient_ref='Patient/123', coding_system='http://snomed.info/sct', code_attribute='cui')

Populate or update the problem list using entities extracted via NLP.

This method looks for entities with associated medical codes and creates FHIR Condition resources from them. It supports a three-step process:

  1. NER: Extract entities from text (spaCy, HuggingFace, etc.)
  2. Entity Linking: Add medical codes to those entities
  3. Problem List Creation: Convert linked entities to FHIR conditions (this method)

The method extracts from:

  1. spaCy entities with extension attributes (e.g., ent._.cui)
  2. Generic entities in the NLP annotations container (framework-agnostic)

TODO: make this more generic and support other resource types

PARAMETER DESCRIPTION
patient_ref

FHIR reference to the patient (default: "Patient/123")

TYPE: str DEFAULT: 'Patient/123'

coding_system

Coding system URI for the conditions (default: SNOMED CT)

TYPE: str DEFAULT: 'http://snomed.info/sct'

code_attribute

Name of the attribute containing the medical code (default: "cui")

TYPE: str DEFAULT: 'cui'

Notes
  • Preserves any existing problem list Conditions.
  • Supports framework-agnostic extraction (spaCy and dict entities).
  • For spaCy, looks for entity extension attribute (e.g. ent._.cui).
  • For non-spaCy, expects codes as dict keys (ent["cui"], etc.).
Source code in healthchain/io/containers/document.py
def update_problem_list_from_nlp(
    self,
    patient_ref: str = "Patient/123",
    coding_system: str = "http://snomed.info/sct",
    code_attribute: str = "cui",
):
    """
    Populate or update the problem list using entities extracted via NLP.

    This method looks for entities with associated medical codes and creates FHIR Condition
    resources from them. It supports a three-step process:
    1. NER: Extract entities from text (spaCy, HuggingFace, etc.)
    2. Entity Linking: Add medical codes to those entities
    3. Problem List Creation: Convert linked entities to FHIR conditions (this method)

    The method extracts from:
    1. spaCy entities with extension attributes (e.g., ent._.cui)
    2. Generic entities in the NLP annotations container (framework-agnostic)

    TODO: make this more generic and support other resource types

    Args:
        patient_ref: FHIR reference to the patient (default: "Patient/123")
        coding_system: Coding system URI for the conditions (default: SNOMED CT)
        code_attribute: Name of the attribute containing the medical code (default: "cui")

    Notes:
        - Preserves any existing problem list Conditions.
        - Supports framework-agnostic extraction (spaCy and dict entities).
        - For spaCy, looks for entity extension attribute (e.g. ent._.cui).
        - For non-spaCy, expects codes as dict keys (ent["cui"], etc.).
    """
    # Start with existing conditions to preserve them
    existing_conditions = self.fhir.problem_list.copy()
    new_conditions = []

    # 1. Extract from spaCy entities (if available)
    if self.nlp._spacy_doc and self.nlp._spacy_doc.ents:
        for ent in self.nlp._spacy_doc.ents:
            if not Span.has_extension(code_attribute):
                logger.debug(
                    f"Extension '{code_attribute}' not found for spaCy entity {ent.text}"
                )
                continue

            code_value = getattr(ent._, code_attribute, None)
            if code_value is None:
                logger.debug(
                    f"No {code_attribute} found for spaCy entity {ent.text}"
                )
                continue

            condition = create_condition(
                subject=patient_ref,
                code=code_value,
                display=ent.text,
                system=coding_system,
            )
            set_condition_category(condition, "problem-list-item")
            logger.debug(
                f"Adding condition from spaCy: {condition.model_dump(exclude_none=True)}"
            )
            new_conditions.append(condition)

    # 2. Extract from generic NLP entities (framework-agnostic)
    generic_entities = self.nlp.get_entities()
    if generic_entities:
        for ent_dict in generic_entities:
            # Skip if no linked code
            code_value = ent_dict.get(code_attribute)
            if code_value is None:
                logger.debug(
                    f"No {code_attribute} found for entity {ent_dict.get('text', 'unknown')}"
                )
                continue

            entity_text = ent_dict.get("text", "unknown")

            condition = create_condition(
                subject=patient_ref,
                code=code_value,
                display=entity_text,
                system=coding_system,
            )
            set_condition_category(condition, "problem-list-item")
            logger.debug(
                f"Adding condition from entities: {condition.model_dump(exclude_none=True)}"
            )
            new_conditions.append(condition)

    # Update problem list with combined conditions (replace to avoid duplication)
    if new_conditions:
        all_conditions = existing_conditions + new_conditions
        self.fhir.add_resources(all_conditions, "Condition", replace=True)
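
A hedged sketch of the spaCy path: it registers the cui extension, links a code onto one entity, and assigns the private _spacy_doc slot that this method reads. In a real pipeline an NLP component would populate that slot instead:

import spacy
from spacy.tokens import Span
from healthchain.io.containers.document import Document

Span.set_extension("cui", default=None, force=True)

nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns([{"label": "CONDITION", "pattern": "hypertension"}])

doc = Document(data="Patient has hypertension")
spacy_doc = nlp(doc.text)
spacy_doc.ents[0]._.cui = "38341003"  # SNOMED CT: hypertensive disorder
doc.nlp._spacy_doc = spacy_doc        # private slot; illustrative only
doc.update_problem_list_from_nlp(patient_ref="Patient/123")
print(len(doc.fhir.problem_list))     # 1 Condition created from the coded entity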

word_count()

Return the number of word tokens in the document.

RETURNS DESCRIPTION
int

The count of tokenized words in the document.

TYPE: int

Source code in healthchain/io/containers/document.py
def word_count(self) -> int:
    """
    Return the number of word tokens in the document.

    Returns:
        int: The count of tokenized words in the document.
    """
    return len(self._nlp._tokens)

FeatureSchema

Bases: BaseModel

Schema defining how to extract features from FHIR resources.

Source code in healthchain/io/containers/featureschema.py
class FeatureSchema(BaseModel):
    """Schema defining how to extract features from FHIR resources."""

    name: str
    version: str
    features: Dict[str, FeatureMapping] = {}
    description: Optional[str] = None
    model_info: Optional[Dict[str, Any]] = None
    metadata: Optional[Dict[str, Any]] = None

    model_config = ConfigDict(extra="allow")

    @field_validator("features", mode="before")
    @classmethod
    def convert_feature_dicts(cls, v):
        """Convert feature dicts to FeatureMapping objects if needed."""
        if v and isinstance(v, dict):
            # Check if values are dicts (need conversion) or already FeatureMapping
            if v and isinstance(list(v.values())[0], dict):
                return {
                    name: FeatureMapping.from_dict(name, mapping)
                    for name, mapping in v.items()
                }
        return v

    @classmethod
    def from_yaml(cls, path: Union[str, Path]) -> "FeatureSchema":
        """Load schema from a YAML file.

        Args:
            path: Path to the YAML file

        Returns:
            FeatureSchema instance

        Example:
            >>> schema = FeatureSchema.from_yaml("configs/features/sepsis_vitals.yaml")
        """
        path = Path(path)
        with open(path, "r") as f:
            data = yaml.safe_load(f)

        return cls.model_validate(data)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "FeatureSchema":
        """Create a FeatureSchema from a dictionary.

        Args:
            data: Dictionary containing schema configuration

        Returns:
            FeatureSchema instance
        """
        return cls.model_validate(data)

    def to_dict(self) -> Dict[str, Any]:
        """Convert schema to dictionary format.

        Returns:
            Dictionary representation of the schema
        """
        result = {
            "name": self.name,
            "version": self.version,
            "description": self.description,
            "model_info": self.model_info,
            "features": {
                name: {
                    k: v
                    for k, v in mapping.model_dump().items()
                    if k != "name" and v is not None
                }
                for name, mapping in self.features.items()
            },
        }
        if self.metadata:
            result["metadata"] = self.metadata
        return result

    def to_yaml(self, path: Union[str, Path]) -> None:
        """Save schema to a YAML file.

        Args:
            path: Path where the YAML file will be saved
        """
        path = Path(path)
        path.parent.mkdir(parents=True, exist_ok=True)

        with open(path, "w") as f:
            yaml.dump(self.to_dict(), f, default_flow_style=False, sort_keys=False)

    def get_feature_names(self) -> List[str]:
        """Get list of feature names in order.

        Returns:
            List of feature names
        """
        return list(self.features.keys())

    def get_required_features(self) -> List[str]:
        """Get list of required feature names.

        Returns:
            List of required feature names
        """
        return [name for name, mapping in self.features.items() if mapping.required]

    def get_features_by_resource(self, resource_type: str) -> Dict[str, FeatureMapping]:
        """Get all features mapped to a specific FHIR resource type.

        Args:
            resource_type: FHIR resource type (e.g., "Observation", "Patient")

        Returns:
            Dictionary of features for the specified resource type
        """
        return {
            name: mapping
            for name, mapping in self.features.items()
            if mapping.fhir_resource == resource_type
        }

    def get_observation_codes(self) -> Dict[str, FeatureMapping]:
        """Get all Observation features with their codes.

        Returns:
            Dictionary mapping codes to feature mappings
        """
        observations = self.get_features_by_resource("Observation")
        return {
            mapping.code: mapping for mapping in observations.values() if mapping.code
        }

    def validate_dataframe_columns(self, columns: List[str]) -> Dict[str, Any]:
        """Validate that a DataFrame has the expected columns.

        Args:
            columns: List of column names from a DataFrame

        Returns:
            Dictionary with validation results:
                - valid: bool
                - missing_required: List of missing required features
                - unexpected: List of unexpected columns
                - missing_optional: List of missing optional features
        """
        expected = set(self.get_feature_names())
        actual = set(columns)
        required = set(self.get_required_features())

        missing_required = list(required - actual)
        unexpected = list(actual - expected)

        return {
            "valid": len(missing_required) == 0,
            "missing_required": missing_required,
            "unexpected": unexpected,
            "missing_optional": list((expected - required) - actual),
        }
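
A schema sketch built in code. The per-feature field names (fhir_resource, code, dtype, required) are inferred from how FeatureMapping is used in the source above, so treat them as illustrative:

from healthchain.io.containers.featureschema import FeatureSchema

schema = FeatureSchema.from_dict({
    "name": "sepsis_vitals",
    "version": "1.0",
    "features": {
        "heart_rate": {
            "fhir_resource": "Observation",
            "code": "8867-4",      # LOINC: heart rate
            "dtype": "float64",
            "required": True,
        },
    },
})
print(schema.get_required_features())        # ['heart_rate']
print(list(schema.get_observation_codes()))  # ['8867-4']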

convert_feature_dicts(v) classmethod

Convert feature dicts to FeatureMapping objects if needed.

Source code in healthchain/io/containers/featureschema.py
@field_validator("features", mode="before")
@classmethod
def convert_feature_dicts(cls, v):
    """Convert feature dicts to FeatureMapping objects if needed."""
    if v and isinstance(v, dict):
        # Check if values are dicts (need conversion) or already FeatureMapping
        if v and isinstance(list(v.values())[0], dict):
            return {
                name: FeatureMapping.from_dict(name, mapping)
                for name, mapping in v.items()
            }
    return v

from_dict(data) classmethod

Create a FeatureSchema from a dictionary.

PARAMETER DESCRIPTION
data

Dictionary containing schema configuration

TYPE: Dict[str, Any]

RETURNS DESCRIPTION
FeatureSchema

FeatureSchema instance

Source code in healthchain/io/containers/featureschema.py
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "FeatureSchema":
    """Create a FeatureSchema from a dictionary.

    Args:
        data: Dictionary containing schema configuration

    Returns:
        FeatureSchema instance
    """
    return cls.model_validate(data)

from_yaml(path) classmethod

Load schema from a YAML file.

PARAMETER DESCRIPTION
path

Path to the YAML file

TYPE: Union[str, Path]

RETURNS DESCRIPTION
FeatureSchema

FeatureSchema instance

Example

schema = FeatureSchema.from_yaml("configs/features/sepsis_vitals.yaml")

Source code in healthchain/io/containers/featureschema.py
@classmethod
def from_yaml(cls, path: Union[str, Path]) -> "FeatureSchema":
    """Load schema from a YAML file.

    Args:
        path: Path to the YAML file

    Returns:
        FeatureSchema instance

    Example:
        >>> schema = FeatureSchema.from_yaml("configs/features/sepsis_vitals.yaml")
    """
    path = Path(path)
    with open(path, "r") as f:
        data = yaml.safe_load(f)

    return cls.model_validate(data)

get_feature_names()

Get list of feature names in order.

RETURNS DESCRIPTION
List[str]

List of feature names

Source code in healthchain/io/containers/featureschema.py
def get_feature_names(self) -> List[str]:
    """Get list of feature names in order.

    Returns:
        List of feature names
    """
    return list(self.features.keys())

get_features_by_resource(resource_type)

Get all features mapped to a specific FHIR resource type.

PARAMETER DESCRIPTION
resource_type

FHIR resource type (e.g., "Observation", "Patient")

TYPE: str

RETURNS DESCRIPTION
Dict[str, FeatureMapping]

Dictionary of features for the specified resource type

Source code in healthchain/io/containers/featureschema.py
def get_features_by_resource(self, resource_type: str) -> Dict[str, FeatureMapping]:
    """Get all features mapped to a specific FHIR resource type.

    Args:
        resource_type: FHIR resource type (e.g., "Observation", "Patient")

    Returns:
        Dictionary of features for the specified resource type
    """
    return {
        name: mapping
        for name, mapping in self.features.items()
        if mapping.fhir_resource == resource_type
    }

get_observation_codes()

Get all Observation features with their codes.

RETURNS DESCRIPTION
Dict[str, FeatureMapping]

Dictionary mapping codes to feature mappings

Source code in healthchain/io/containers/featureschema.py
def get_observation_codes(self) -> Dict[str, FeatureMapping]:
    """Get all Observation features with their codes.

    Returns:
        Dictionary mapping codes to feature mappings
    """
    observations = self.get_features_by_resource("Observation")
    return {
        mapping.code: mapping for mapping in observations.values() if mapping.code
    }

get_required_features()

Get list of required feature names.

RETURNS DESCRIPTION
List[str]

List of required feature names

Source code in healthchain/io/containers/featureschema.py
def get_required_features(self) -> List[str]:
    """Get list of required feature names.

    Returns:
        List of required feature names
    """
    return [name for name, mapping in self.features.items() if mapping.required]

to_dict()

Convert schema to dictionary format.

RETURNS DESCRIPTION
Dict[str, Any]

Dictionary representation of the schema

Source code in healthchain/io/containers/featureschema.py
def to_dict(self) -> Dict[str, Any]:
    """Convert schema to dictionary format.

    Returns:
        Dictionary representation of the schema
    """
    result = {
        "name": self.name,
        "version": self.version,
        "description": self.description,
        "model_info": self.model_info,
        "features": {
            name: {
                k: v
                for k, v in mapping.model_dump().items()
                if k != "name" and v is not None
            }
            for name, mapping in self.features.items()
        },
    }
    if self.metadata:
        result["metadata"] = self.metadata
    return result

to_yaml(path)

Save schema to a YAML file.

PARAMETER DESCRIPTION
path

Path where the YAML file will be saved

TYPE: Union[str, Path]

Source code in healthchain/io/containers/featureschema.py
def to_yaml(self, path: Union[str, Path]) -> None:
    """Save schema to a YAML file.

    Args:
        path: Path where the YAML file will be saved
    """
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)

    with open(path, "w") as f:
        yaml.dump(self.to_dict(), f, default_flow_style=False, sort_keys=False)

validate_dataframe_columns(columns)

Validate that a DataFrame has the expected columns.

PARAMETER DESCRIPTION
columns

List of column names from a DataFrame

TYPE: List[str]

RETURNS DESCRIPTION
Dict[str, Any]

Dictionary with validation results:

  • valid: bool
  • missing_required: List of missing required features
  • unexpected: List of unexpected columns
  • missing_optional: List of missing optional features

Source code in healthchain/io/containers/featureschema.py
def validate_dataframe_columns(self, columns: List[str]) -> Dict[str, Any]:
    """Validate that a DataFrame has the expected columns.

    Args:
        columns: List of column names from a DataFrame

    Returns:
        Dictionary with validation results:
            - valid: bool
            - missing_required: List of missing required features
            - unexpected: List of unexpected columns
            - missing_optional: List of missing optional features
    """
    expected = set(self.get_feature_names())
    actual = set(columns)
    required = set(self.get_required_features())

    missing_required = list(required - actual)
    unexpected = list(actual - expected)

    return {
        "valid": len(missing_required) == 0,
        "missing_required": missing_required,
        "unexpected": unexpected,
        "missing_optional": list((expected - required) - actual),
    }
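
Continuing the hypothetical schema from the sketch above:

report = schema.validate_dataframe_columns(["heart_rate", "extra_col"])
# {'valid': True, 'missing_required': [],
#  'unexpected': ['extra_col'], 'missing_optional': []}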

base

BaseDocument dataclass

Bases: DataContainer[str]

Base document container for raw text content.

Source code in healthchain/io/containers/base.py
@dataclass
class BaseDocument(DataContainer[str]):
    """Base document container for raw text content."""

    data: str
    text: str = field(init=False)

    def __post_init__(self):
        self.text = self.data

    def char_count(self) -> int:
        return len(self.text)

DataContainer dataclass

Bases: Generic[T]

A generic container for data.

This class represents a container for data with a specific type T.

ATTRIBUTE DESCRIPTION
data

The data stored in the container.

TYPE: T

METHOD DESCRIPTION
to_dict

Converts the container's data to a dictionary.

to_json

Converts the container's data to a JSON string.

from_dict

Dict[str, Any]) -> "DataContainer": Creates a DataContainer instance from a dictionary.

from_json

str) -> "DataContainer": Creates a DataContainer instance from a JSON string.

Source code in healthchain/io/containers/base.py
@dataclass
class DataContainer(Generic[T]):
    """
    A generic container for data.

    This class represents a container for data with a specific type T.

    Attributes:
        data (T): The data stored in the container.

    Methods:
        to_dict() -> Dict[str, Any]:
            Converts the container's data to a dictionary.

        to_json() -> str:
            Converts the container's data to a JSON string.

        from_dict(cls, data: Dict[str, Any]) -> "DataContainer":
            Creates a DataContainer instance from a dictionary.

        from_json(cls, json_str: str) -> "DataContainer":
            Creates a DataContainer instance from a JSON string.
    """

    data: T

    def to_dict(self) -> Dict[str, Any]:
        return {k: v for k, v in self.__dict__.items() if not k.startswith("_")}

    def to_json(self) -> str:
        return json.dumps(self.to_dict())

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "DataContainer":
        return cls(**data)

    @classmethod
    def from_json(cls, json_str: str) -> "DataContainer":
        return cls.from_dict(json.loads(json_str))
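
The serialization helpers compose into a simple round trip:

container = DataContainer(data={"score": 0.92})
payload = container.to_json()
restored = DataContainer.from_json(payload)
assert restored.data == {"score": 0.92}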

dataset

Dataset dataclass

Bases: DataContainer[DataFrame]

A container for tabular data optimized for ML inference, lightweight wrapper around a pandas DataFrame.

ATTRIBUTE DESCRIPTION
data

The pandas DataFrame containing the dataset.

TYPE: T

metadata

Dict for storing pipeline results (predictions, probabilities, etc.)

TYPE: Dict[str, Any]

METHOD DESCRIPTION
from_csv

Load Dataset from CSV.

from_dict

Load Dataset from dict.

from_fhir_bundle

Create Dataset from FHIR Bundle and schema.

to_csv

Save Dataset to CSV.

to_risk_assessment

Convert predictions to FHIR RiskAssessment.

Source code in healthchain/io/containers/dataset.py
@dataclass
class Dataset(DataContainer[pd.DataFrame]):
    """
    A container for tabular data optimized for ML inference, lightweight wrapper around a pandas DataFrame.

    Attributes:
        data: The pandas DataFrame containing the dataset.
        metadata: Dict for storing pipeline results (predictions, probabilities, etc.)

    Methods:
        from_csv: Load Dataset from CSV.
        from_dict: Load Dataset from dict.
        from_fhir_bundle: Create Dataset from FHIR Bundle and schema.
        to_csv: Save Dataset to CSV.
        to_risk_assessment: Convert predictions to FHIR RiskAssessment.
    """

    metadata: Dict[str, Any] = field(default_factory=dict)

    def __post_init__(self):
        if not isinstance(self.data, pd.DataFrame):
            raise TypeError("data must be a pandas DataFrame")

    @property
    def columns(self) -> List[str]:
        return list(self.data.columns)

    @property
    def index(self) -> pd.Index:
        return self.data.index

    @property
    def dtypes(self) -> Dict[str, str]:
        return {col: str(dtype) for col, dtype in self.data.dtypes.items()}

    def column_count(self) -> int:
        return len(self.columns)

    def row_count(self) -> int:
        return len(self.data)

    def get_dtype(self, column: str) -> str:
        return str(self.data[column].dtype)

    def __iter__(self) -> Iterator[str]:
        return iter(self.columns)

    def __len__(self) -> int:
        return self.row_count()

    def describe(self) -> str:
        return f"Dataset with {self.column_count()} columns and {self.row_count()} rows"

    def remove_column(self, name: str) -> None:
        self.data.drop(columns=[name], inplace=True)

    @classmethod
    def from_csv(cls, path: str, **kwargs) -> "Dataset":
        return cls(pd.read_csv(path, **kwargs))

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "Dataset":
        df = pd.DataFrame(data["data"])
        return cls(df)

    def to_csv(self, path: str, **kwargs) -> None:
        self.data.to_csv(path, **kwargs)

    @classmethod
    def from_fhir_bundle(
        cls,
        bundle: Union[Bundle, Dict[str, Any]],
        schema: Union[str, Path, FeatureSchema],
        aggregation: str = "mean",
    ) -> "Dataset":
        """Create Dataset from a FHIR Bundle using a feature schema.

        Extracts features from FHIR resources according to the schema specification,
        converting FHIR data to a pandas DataFrame suitable for ML inference.

        Args:
            bundle: FHIR Bundle resource (object or dict)
            schema: FeatureSchema object, or path to YAML schema file
            aggregation: How to aggregate multiple observation values.
                Options: "mean", "median", "max", "min", "last" (default: "mean")

        Returns:
            Dataset container with extracted features

        Example:
            >>> from fhir.resources.bundle import Bundle
            >>> bundle = Bundle(**patient_data)
            >>> dataset = Dataset.from_fhir_bundle(
            ...     bundle,
            ...     schema="healthchain/configs/features/sepsis_vitals.yaml"
            ... )
            >>> df = dataset.data
        """
        # Load schema if path provided
        if isinstance(schema, (str, Path)):
            schema = FeatureSchema.from_yaml(schema)

        # Extract features using mapper
        mapper = FHIRFeatureMapper(schema)
        df = mapper.extract_features(bundle, aggregation=aggregation)

        return cls(df)

    def validate(
        self, schema: FeatureSchema, raise_on_error: bool = False
    ) -> ValidationResult:
        """Validate DataFrame against a feature schema.

        Checks that required features are present and have correct data types.

        Args:
            schema: FeatureSchema to validate against
            raise_on_error: Whether to raise exception on validation failure

        Returns:
            ValidationResult with validation status and details

        Raises:
            ValueError: If raise_on_error is True and validation fails

        Example:
            >>> schema = FeatureSchema.from_yaml("configs/features/sepsis_vitals.yaml")
            >>> result = dataset.validate(schema)
            >>> if not result.valid:
            ...     print(result.errors)
        """
        result = ValidationResult(valid=True)

        # Check for missing required features
        required = schema.get_required_features()
        missing = [f for f in required if f not in self.data.columns]

        for feature in missing:
            result.add_missing_feature(feature)

        # Check data types for present features
        for feature_name, mapping in schema.features.items():
            if feature_name in self.data.columns:
                actual_dtype = str(self.data[feature_name].dtype)
                expected_dtype = mapping.dtype

                # Check for type mismatches (allow some flexibility)
                if not self._dtypes_compatible(actual_dtype, expected_dtype):
                    result.add_type_mismatch(feature_name, expected_dtype, actual_dtype)

        # Warn about optional missing features
        optional = set(schema.get_feature_names()) - set(required)
        missing_optional = [f for f in optional if f not in self.data.columns]

        for feature in missing_optional:
            result.add_warning(f"Optional feature '{feature}' is missing")

        if raise_on_error and not result.valid:
            raise ValueError(str(result))

        return result

    def _dtypes_compatible(self, actual: str, expected: str) -> bool:
        """Check if actual dtype is compatible with expected dtype.

        Args:
            actual: Actual dtype string
            expected: Expected dtype string

        Returns:
            True if dtypes are compatible
        """
        # Handle numeric types flexibly
        numeric_types = {"int64", "int32", "float64", "float32"}
        if expected in numeric_types and actual in numeric_types:
            return True

        # Exact match for non-numeric types
        return actual == expected

    def to_risk_assessment(
        self,
        outcome_code: str,
        outcome_display: str,
        outcome_system: str = "http://hl7.org/fhir/sid/icd-10",
        model_name: Optional[str] = None,
        model_version: Optional[str] = None,
        high_threshold: float = 0.7,
        moderate_threshold: float = 0.4,
        predictions: Optional[np.ndarray] = None,
        probabilities: Optional[np.ndarray] = None,
    ) -> List[RiskAssessment]:
        """Convert model predictions to FHIR RiskAssessment resources.

        Creates RiskAssessment resources from ML model output, suitable for
        including in FHIR Bundles or sending to FHIR servers.

        Args:
            outcome_code: Code for the predicted outcome (e.g., "A41.9" for sepsis)
            outcome_display: Display text for the outcome (e.g., "Sepsis")
            outcome_system: Code system for the outcome (default: ICD-10)
            model_name: Name of the ML model (optional)
            model_version: Version of the ML model (optional)
            high_threshold: Threshold for high risk (default: 0.7)
            moderate_threshold: Threshold for moderate risk (default: 0.4)
            predictions: Binary predictions array (0/1). Defaults to metadata["predictions"]
            probabilities: Probability scores array (0-1). Defaults to metadata["probabilities"]

        Returns:
            List of RiskAssessment resources, one per patient

        Example:
            >>> risk_assessments = dataset.to_risk_assessment(
            ...     outcome_code="A41.9",
            ...     outcome_display="Sepsis, unspecified",
            ...     model_name="RandomForest",
            ...     model_version="1.0"
            ... )
        """
        # Fall back to metadata if not provided
        if predictions is None:
            predictions = self.metadata.get("predictions")
        if probabilities is None:
            probabilities = self.metadata.get("probabilities")

        if predictions is None or probabilities is None:
            raise ValueError(
                "predictions and probabilities must be provided or available in metadata"
            )

        if len(predictions) != len(self.data):
            raise ValueError(
                f"Predictions length ({len(predictions)}) must match "
                f"DataFrame length ({len(self.data)})"
            )

        if len(probabilities) != len(self.data):
            raise ValueError(
                f"Probabilities length ({len(probabilities)}) must match "
                f"DataFrame length ({len(self.data)})"
            )

        risk_assessments = []

        # Get patient references
        if "patient_ref" not in self.data.columns:
            raise ValueError("DataFrame must have 'patient_ref' column")

        for idx, row in self.data.iterrows():
            patient_ref = row["patient_ref"]
            prediction = int(predictions[idx])
            probability = float(probabilities[idx])

            # Determine qualitative risk
            if probability >= high_threshold:
                qualitative_risk = "high"
            elif probability >= moderate_threshold:
                qualitative_risk = "moderate"
            else:
                qualitative_risk = "low"

            # Build prediction dict
            prediction_dict = {
                "outcome": {
                    "code": outcome_code,
                    "display": outcome_display,
                    "system": outcome_system,
                },
                "probability": probability,
                "qualitative_risk": qualitative_risk,
            }

            # Create method CodeableConcept if model info provided
            method = None
            if model_name:
                method = create_single_codeable_concept(
                    code=model_name,
                    display=f"{model_name} v{model_version}"
                    if model_version
                    else model_name,
                    system="https://healthchain.github.io/ml-models",
                )

            # Create comment with prediction details
            comment = (
                f"ML prediction: {'Positive' if prediction == 1 else 'Negative'} "
                f"(probability: {probability:.2%}, risk: {qualitative_risk})"
            )

            # Create RiskAssessment
            risk_assessment = create_risk_assessment_from_prediction(
                subject=patient_ref,
                prediction=prediction_dict,
                method=method,
                comment=comment,
            )

            risk_assessments.append(risk_assessment)

        return risk_assessments
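
Taken together, a typical inference round trip looks like the sketch below. The file paths and prediction values are hypothetical, and the CSV must carry a patient_ref column for to_risk_assessment to work:

import numpy as np

dataset = Dataset.from_csv("data/sepsis_cohort.csv")  # hypothetical; includes a 'patient_ref' column
schema = FeatureSchema.from_yaml("configs/features/sepsis_vitals.yaml")
dataset.validate(schema, raise_on_error=True)

# Stand-in model output: one entry per row of the CSV
dataset.metadata["predictions"] = np.array([1, 0, 0])
dataset.metadata["probabilities"] = np.array([0.82, 0.35, 0.12])

assessments = dataset.to_risk_assessment(
    outcome_code="A41.9",
    outcome_display="Sepsis, unspecified",
    model_name="RandomForest",
    model_version="1.0",
)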

from_fhir_bundle(bundle, schema, aggregation='mean') classmethod

Create Dataset from a FHIR Bundle using a feature schema.

Extracts features from FHIR resources according to the schema specification, converting FHIR data to a pandas DataFrame suitable for ML inference.

PARAMETER DESCRIPTION
bundle

FHIR Bundle resource (object or dict)

TYPE: Union[Bundle, Dict[str, Any]]

schema

FeatureSchema object, or path to YAML schema file

TYPE: Union[str, Path, FeatureSchema]

aggregation

How to aggregate multiple observation values. Options: "mean", "median", "max", "min", "last" (default: "mean")

TYPE: str DEFAULT: 'mean'

RETURNS DESCRIPTION
Dataset

Dataset container with extracted features

Example

from fhir.resources.bundle import Bundle
bundle = Bundle(**patient_data)
dataset = Dataset.from_fhir_bundle(
    bundle,
    schema="healthchain/configs/features/sepsis_vitals.yaml"
)
df = dataset.data

Source code in healthchain/io/containers/dataset.py
@classmethod
def from_fhir_bundle(
    cls,
    bundle: Union[Bundle, Dict[str, Any]],
    schema: Union[str, Path, FeatureSchema],
    aggregation: str = "mean",
) -> "Dataset":
    """Create Dataset from a FHIR Bundle using a feature schema.

    Extracts features from FHIR resources according to the schema specification,
    converting FHIR data to a pandas DataFrame suitable for ML inference.

    Args:
        bundle: FHIR Bundle resource (object or dict)
        schema: FeatureSchema object, or path to YAML schema file
        aggregation: How to aggregate multiple observation values.
            Options: "mean", "median", "max", "min", "last" (default: "mean")

    Returns:
        Dataset container with extracted features

    Example:
        >>> from fhir.resources.bundle import Bundle
        >>> bundle = Bundle(**patient_data)
        >>> dataset = Dataset.from_fhir_bundle(
        ...     bundle,
        ...     schema="healthchain/configs/features/sepsis_vitals.yaml"
        ... )
        >>> df = dataset.data
    """
    # Load schema if path provided
    if isinstance(schema, (str, Path)):
        schema = FeatureSchema.from_yaml(schema)

    # Extract features using mapper
    mapper = FHIRFeatureMapper(schema)
    df = mapper.extract_features(bundle, aggregation=aggregation)

    return cls(df)
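
The aggregation argument controls how repeated Observations for the same feature collapse into one value. Reusing the bundle and schema from the docstring example, a sketch of two common choices:

dataset_mean = Dataset.from_fhir_bundle(bundle, schema=schema)                      # averages repeated readings
dataset_last = Dataset.from_fhir_bundle(bundle, schema=schema, aggregation="last")  # keeps the last extracted reading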

to_risk_assessment(outcome_code, outcome_display, outcome_system='http://hl7.org/fhir/sid/icd-10', model_name=None, model_version=None, high_threshold=0.7, moderate_threshold=0.4, predictions=None, probabilities=None)

Convert model predictions to FHIR RiskAssessment resources.

Creates RiskAssessment resources from ML model output, suitable for including in FHIR Bundles or sending to FHIR servers.

PARAMETER DESCRIPTION
outcome_code

Code for the predicted outcome (e.g., "A41.9" for sepsis)

TYPE: str

outcome_display

Display text for the outcome (e.g., "Sepsis")

TYPE: str

outcome_system

Code system for the outcome (default: ICD-10)

TYPE: str DEFAULT: 'http://hl7.org/fhir/sid/icd-10'

model_name

Name of the ML model (optional)

TYPE: Optional[str] DEFAULT: None

model_version

Version of the ML model (optional)

TYPE: Optional[str] DEFAULT: None

high_threshold

Threshold for high risk (default: 0.7)

TYPE: float DEFAULT: 0.7

moderate_threshold

Threshold for moderate risk (default: 0.4)

TYPE: float DEFAULT: 0.4

predictions

Binary predictions array (0/1). Defaults to metadata["predictions"]

TYPE: Optional[ndarray] DEFAULT: None

probabilities

Probability scores array (0-1). Defaults to metadata["probabilities"]

TYPE: Optional[ndarray] DEFAULT: None

RETURNS DESCRIPTION
List[RiskAssessment]

List of RiskAssessment resources, one per patient

Example

risk_assessments = dataset.to_risk_assessment(
    outcome_code="A41.9",
    outcome_display="Sepsis, unspecified",
    model_name="RandomForest",
    model_version="1.0"
)

Source code in healthchain/io/containers/dataset.py
def to_risk_assessment(
    self,
    outcome_code: str,
    outcome_display: str,
    outcome_system: str = "http://hl7.org/fhir/sid/icd-10",
    model_name: Optional[str] = None,
    model_version: Optional[str] = None,
    high_threshold: float = 0.7,
    moderate_threshold: float = 0.4,
    predictions: Optional[np.ndarray] = None,
    probabilities: Optional[np.ndarray] = None,
) -> List[RiskAssessment]:
    """Convert model predictions to FHIR RiskAssessment resources.

    Creates RiskAssessment resources from ML model output, suitable for
    including in FHIR Bundles or sending to FHIR servers.

    Args:
        outcome_code: Code for the predicted outcome (e.g., "A41.9" for sepsis)
        outcome_display: Display text for the outcome (e.g., "Sepsis")
        outcome_system: Code system for the outcome (default: ICD-10)
        model_name: Name of the ML model (optional)
        model_version: Version of the ML model (optional)
        high_threshold: Threshold for high risk (default: 0.7)
        moderate_threshold: Threshold for moderate risk (default: 0.4)
        predictions: Binary predictions array (0/1). Defaults to metadata["predictions"]
        probabilities: Probability scores array (0-1). Defaults to metadata["probabilities"]

    Returns:
        List of RiskAssessment resources, one per patient

    Example:
        >>> risk_assessments = dataset.to_risk_assessment(
        ...     outcome_code="A41.9",
        ...     outcome_display="Sepsis, unspecified",
        ...     model_name="RandomForest",
        ...     model_version="1.0"
        ... )
    """
    # Fall back to metadata if not provided
    if predictions is None:
        predictions = self.metadata.get("predictions")
    if probabilities is None:
        probabilities = self.metadata.get("probabilities")

    if predictions is None or probabilities is None:
        raise ValueError(
            "predictions and probabilities must be provided or available in metadata"
        )

    if len(predictions) != len(self.data):
        raise ValueError(
            f"Predictions length ({len(predictions)}) must match "
            f"DataFrame length ({len(self.data)})"
        )

    if len(probabilities) != len(self.data):
        raise ValueError(
            f"Probabilities length ({len(probabilities)}) must match "
            f"DataFrame length ({len(self.data)})"
        )

    risk_assessments = []

    # Get patient references
    if "patient_ref" not in self.data.columns:
        raise ValueError("DataFrame must have 'patient_ref' column")

    for idx, row in self.data.iterrows():
        patient_ref = row["patient_ref"]
        prediction = int(predictions[idx])
        probability = float(probabilities[idx])

        # Determine qualitative risk
        if probability >= high_threshold:
            qualitative_risk = "high"
        elif probability >= moderate_threshold:
            qualitative_risk = "moderate"
        else:
            qualitative_risk = "low"

        # Build prediction dict
        prediction_dict = {
            "outcome": {
                "code": outcome_code,
                "display": outcome_display,
                "system": outcome_system,
            },
            "probability": probability,
            "qualitative_risk": qualitative_risk,
        }

        # Create method CodeableConcept if model info provided
        method = None
        if model_name:
            method = create_single_codeable_concept(
                code=model_name,
                display=f"{model_name} v{model_version}"
                if model_version
                else model_name,
                system="https://healthchain.github.io/ml-models",
            )

        # Create comment with prediction details
        comment = (
            f"ML prediction: {'Positive' if prediction == 1 else 'Negative'} "
            f"(probability: {probability:.2%}, risk: {qualitative_risk})"
        )

        # Create RiskAssessment
        risk_assessment = create_risk_assessment_from_prediction(
            subject=patient_ref,
            prediction=prediction_dict,
            method=method,
            comment=comment,
        )

        risk_assessments.append(risk_assessment)

    return risk_assessments
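
The qualitative risk label is derived purely from the probability and the two thresholds: with the defaults, 0.82 maps to "high", 0.55 to "moderate", and 0.21 to "low". Tightening the bands is just a matter of passing different cut-offs (a sketch):

assessments = dataset.to_risk_assessment(
    outcome_code="A41.9",
    outcome_display="Sepsis, unspecified",
    high_threshold=0.8,      # stricter "high" band
    moderate_threshold=0.5,
)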

validate(schema, raise_on_error=False)

Validate DataFrame against a feature schema.

Checks that required features are present and have correct data types.

PARAMETER DESCRIPTION
schema

FeatureSchema to validate against

TYPE: FeatureSchema

raise_on_error

Whether to raise exception on validation failure

TYPE: bool DEFAULT: False

RETURNS DESCRIPTION
ValidationResult

ValidationResult with validation status and details

RAISES DESCRIPTION
ValueError

If raise_on_error is True and validation fails

Example

schema = FeatureSchema.from_yaml("configs/features/sepsis_vitals.yaml")
result = dataset.validate(schema)
if not result.valid:
    print(result.errors)

Source code in healthchain/io/containers/dataset.py
def validate(
    self, schema: FeatureSchema, raise_on_error: bool = False
) -> ValidationResult:
    """Validate DataFrame against a feature schema.

    Checks that required features are present and have correct data types.

    Args:
        schema: FeatureSchema to validate against
        raise_on_error: Whether to raise exception on validation failure

    Returns:
        ValidationResult with validation status and details

    Raises:
        ValueError: If raise_on_error is True and validation fails

    Example:
        >>> schema = FeatureSchema.from_yaml("configs/features/sepsis_vitals.yaml")
        >>> result = dataset.validate(schema)
        >>> if not result.valid:
        ...     print(result.errors)
    """
    result = ValidationResult(valid=True)

    # Check for missing required features
    required = schema.get_required_features()
    missing = [f for f in required if f not in self.data.columns]

    for feature in missing:
        result.add_missing_feature(feature)

    # Check data types for present features
    for feature_name, mapping in schema.features.items():
        if feature_name in self.data.columns:
            actual_dtype = str(self.data[feature_name].dtype)
            expected_dtype = mapping.dtype

            # Check for type mismatches (allow some flexibility)
            if not self._dtypes_compatible(actual_dtype, expected_dtype):
                result.add_type_mismatch(feature_name, expected_dtype, actual_dtype)

    # Warn about optional missing features
    optional = set(schema.get_feature_names()) - set(required)
    missing_optional = [f for f in optional if f not in self.data.columns]

    for feature in missing_optional:
        result.add_warning(f"Optional feature '{feature}' is missing")

    if raise_on_error and not result.valid:
        raise ValueError(str(result))

    return result
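
With raise_on_error=True the check doubles as a guard clause at the top of a pipeline (a minimal sketch):

import logging

logger = logging.getLogger(__name__)

try:
    dataset.validate(schema, raise_on_error=True)
except ValueError as e:
    logger.error("Schema validation failed: %s", e)
    raise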

document

CdsAnnotations dataclass

Container for Clinical Decision Support (CDS) results.

This class stores and manages outputs from clinical decision support systems, including CDS Hooks cards and suggested clinical actions. The cards contain recommendations, warnings, and other decision support content that can be displayed to clinicians. Actions represent specific clinical tasks or interventions that are suggested based on the analysis.

ATTRIBUTE DESCRIPTION
_cards

CDS Hooks cards containing clinical recommendations, warnings, or other decision support content.

TYPE: Optional[List[Card]]

_actions

Suggested clinical actions that could be taken based on the CDS analysis.

TYPE: Optional[List[Action]]

Example

cds = CdsAnnotations()
cds.cards = [Card(summary="Consider aspirin")]
cds.actions = [Action(type="create", description="Order aspirin")]

Source code in healthchain/io/containers/document.py
@dataclass
class CdsAnnotations:
    """
    Container for Clinical Decision Support (CDS) results.

    This class stores and manages outputs from clinical decision support systems,
    including CDS Hooks cards and suggested clinical actions. The cards contain
    recommendations, warnings, and other decision support content that can be
    displayed to clinicians. Actions represent specific clinical tasks or
    interventions that are suggested based on the analysis.

    Attributes:
        _cards (Optional[List[Card]]): CDS Hooks cards containing clinical
            recommendations, warnings, or other decision support content.
        _actions (Optional[List[Action]]): Suggested clinical actions that
            could be taken based on the CDS analysis.

    Example:
        >>> cds = CdsAnnotations()
        >>> cds.cards = [Card(summary="Consider aspirin")]
        >>> cds.actions = [Action(type="create", description="Order aspirin")]
    """

    _cards: Optional[List[Card]] = None
    _actions: Optional[List[Action]] = None

    @property
    def cards(self) -> Optional[List[Card]]:
        """Get the current list of CDS Hooks cards."""
        return self._cards

    @cards.setter
    def cards(self, cards: Union[List[Card], List[Dict[str, Any]]]) -> None:
        """
        Set CDS Hooks cards, converting from dictionaries if needed.

        Args:
            cards: List of Card objects or dictionaries that can be converted to Cards.

        Raises:
            ValueError: If cards list is empty or has invalid format.
            TypeError: If cards are neither Card objects nor dictionaries.
        """
        if not cards:
            raise ValueError("Cards must be provided as a list!")

        try:
            if isinstance(cards[0], dict):
                self._cards = [Card(**card) for card in cards]
            elif isinstance(cards[0], Card):
                self._cards = cards
            else:
                raise TypeError("Cards must be either Card objects or dictionaries")
        except (IndexError, KeyError) as e:
            raise ValueError("Invalid card format") from e

    @property
    def actions(self) -> Optional[List[Action]]:
        """Get the current list of suggested clinical actions."""
        return self._actions

    @actions.setter
    def actions(self, actions: Union[List[Action], List[Dict[str, Any]]]) -> None:
        """
        Set suggested clinical actions, converting from dictionaries if needed.

        Args:
            actions: List of Action objects or dictionaries that can be converted to Actions.

        Raises:
            ValueError: If actions list is empty or has invalid format.
            TypeError: If actions are neither Action objects nor dictionaries.
        """
        if not actions:
            raise ValueError("Actions must be provided as a list!")

        try:
            if isinstance(actions[0], dict):
                self._actions = [Action(**action) for action in actions]
            elif isinstance(actions[0], Action):
                self._actions = actions
            else:
                raise TypeError("Actions must be either Action objects or dictionaries")
        except (IndexError, KeyError) as e:
            raise ValueError("Invalid action format") from e

actions property writable

Get the current list of suggested clinical actions.

cards property writable

Get the current list of CDS Hooks cards.

Document dataclass

Bases: BaseDocument

Main document container for processing textual and clinical data in HealthChain.

The Document class is the primary structure used throughout annotation and analytics pipelines, accumulating transformations, extractions, and results from each stage. It seamlessly integrates raw text, NLP annotations, FHIR resources, clinical decision support (CDS) results, and ML model outputs in one object.

Features
  • Accepts text, FHIR Bundles/resources, or lists of FHIR resources as input.
  • Provides basic tokenization and supports integration with NLP models (spaCy, transformers).
  • Stores and manipulates clinical FHIR data via the .fhir property (access to bundles, problem lists, meds, allergies, etc.).
  • Encapsulates CDS Hooks-style decision support cards and suggested actions via the .cds property.
  • Stores outputs from external ML/LLM models: HuggingFace, LangChain, etc.
ATTRIBUTE DESCRIPTION
nlp

NLP output (tokens, entities, embeddings, spaCy doc)

TYPE: NlpAnnotations

fhir

FHIR resources and context (problem list, medication, allergy, etc.)

TYPE: FhirData

cds

Clinical decision support (cards and actions)

TYPE: CdsAnnotations

models

Results from ML/LLM models (HuggingFace, LangChain, etc.)

TYPE: ModelOutputs

text

The text content of the document (if available).

TYPE: str

data

The original input supplied (raw text, Bundle, resource, or list of resources)

TYPE: str

Usage example

doc = Document(data="Patient has hypertension")
doc.nlp._tokens
['Patient', 'has', 'hypertension']
doc.fhir.problem_list = [Condition(...)]
doc.cds.cards = [Card(...)]
doc.models.huggingface_results = ...
for token in doc:
    print(token)

Inherits from

BaseDocument

Source code in healthchain/io/containers/document.py
@dataclass
class Document(BaseDocument):
    """
    Main document container for processing textual and clinical data in HealthChain.

    The Document class is the primary structure used throughout annotation and analytics
    pipelines, accumulating transformations, extractions, and results from each stage. It
    seamlessly integrates raw text, NLP annotations, FHIR resources, clinical decision
    support (CDS) results, and ML model outputs in one object.

    Features:
        - Accepts text, FHIR Bundles/resources, or lists of FHIR resources as input.
        - Provides basic tokenization and supports integration with NLP models (spaCy, transformers).
        - Stores and manipulates clinical FHIR data via the .fhir property (access to bundles, problem lists, meds, allergies, etc.).
        - Encapsulates CDS Hooks-style decision support cards and suggested actions via the .cds property.
        - Stores outputs from external ML/LLM models: HuggingFace, LangChain, etc.

    Attributes:
        nlp (NlpAnnotations): NLP output (tokens, entities, embeddings, spaCy doc)
        fhir (FhirData): FHIR resources and context (problem list, medication, allergy, etc.)
        cds (CdsAnnotations): Clinical decision support (cards and actions)
        models (ModelOutputs): Results from ML/LLM models (HuggingFace, LangChain, etc.)
        text (str): The text content of the document (if available).
        data: The original input supplied (raw text, Bundle, resource, or list of resources)

    Usage example:
        >>> doc = Document(data="Patient has hypertension")
        >>> doc.nlp._tokens
        ['Patient', 'has', 'hypertension']
        >>> doc.fhir.problem_list = [Condition(...)]
        >>> doc.cds.cards = [Card(...)]
        >>> doc.models.huggingface_results = ...
        >>> for token in doc:
        ...     print(token)

    Inherits from:
        BaseDocument
    """

    _nlp: NlpAnnotations = field(default_factory=NlpAnnotations)
    _fhir: FhirData = field(default_factory=FhirData)
    _cds: CdsAnnotations = field(default_factory=CdsAnnotations)
    _models: ModelOutputs = field(default_factory=ModelOutputs)

    @property
    def nlp(self) -> NlpAnnotations:
        return self._nlp

    @property
    def fhir(self) -> FhirData:
        return self._fhir

    @property
    def cds(self) -> CdsAnnotations:
        return self._cds

    @property
    def models(self) -> ModelOutputs:
        return self._models

    def __post_init__(self):
        """
        Post-initialization setup to process textual or FHIR data.

        - If input data is a FHIR Bundle, stores it and extracts OperationOutcome and Provenance resources.
        - If input data is a list of FHIR resources, wraps them in a Bundle.
        - For text input, sets .text field accordingly.
        - Performs basic whitespace tokenization if necessary.
        """
        super().__post_init__()

        # Handle FHIR Bundle data
        if isinstance(self.data, Bundle):
            self._fhir._bundle = self.data

            # Extract OperationOutcome resources (operation results/errors)
            outcomes = extract_resources(self._fhir._bundle, "OperationOutcome")
            if outcomes:
                self._fhir._operation_outcomes = outcomes

            # Extract Provenance resources (data lineage/origin)
            provenances = extract_resources(self._fhir._bundle, "Provenance")
            if provenances:
                self._fhir._provenances = provenances

            self.text = ""  # No text content for bundle-only documents
        # Handle list of FHIR resources
        elif (
            isinstance(self.data, list)
            and self.data
            and isinstance(self.data[0], Resource)
        ):
            self._fhir._bundle = create_bundle()
            for resource in self.data:
                add_resource(self._fhir._bundle, resource)
            self.text = ""  # No text content for resource-only documents
        else:
            # Handle text data
            self.text = self.data if isinstance(self.data, str) else str(self.data)

        if not self._nlp._tokens and self.text:
            self._nlp._tokens = self.text.split()  # Basic tokenization if not provided

    def word_count(self) -> int:
        """
        Return the number of word tokens in the document.

        Returns:
            int: The count of tokenized words in the document.
        """
        return len(self._nlp._tokens)

    def update_problem_list_from_nlp(
        self,
        patient_ref: str = "Patient/123",
        coding_system: str = "http://snomed.info/sct",
        code_attribute: str = "cui",
    ):
        """
        Populate or update the problem list using entities extracted via NLP.

        This method looks for entities with associated medical codes and creates FHIR Condition
        resources from them. It supports a three-step process:
        1. NER: Extract entities from text (spaCy, HuggingFace, etc.)
        2. Entity Linking: Add medical codes to those entities
        3. Problem List Creation: Convert linked entities to FHIR conditions (this method)

        The method extracts from:
        1. spaCy entities with extension attributes (e.g., ent._.cui)
        2. Generic entities in the NLP annotations container (framework-agnostic)

        TODO: make this more generic and support other resource types

        Args:
            patient_ref: FHIR reference to the patient (default: "Patient/123")
            coding_system: Coding system URI for the conditions (default: SNOMED CT)
            code_attribute: Name of the attribute containing the medical code (default: "cui")

        Notes:
            - Preserves any existing problem list Conditions.
            - Supports framework-agnostic extraction (spaCy and dict entities).
            - For spaCy, looks for entity extension attribute (e.g. ent._.cui).
            - For non-spaCy, expects codes as dict keys (ent["cui"], etc.).
        """
        # Start with existing conditions to preserve them
        existing_conditions = self.fhir.problem_list.copy()
        new_conditions = []

        # 1. Extract from spaCy entities (if available)
        if self.nlp._spacy_doc and self.nlp._spacy_doc.ents:
            for ent in self.nlp._spacy_doc.ents:
                if not Span.has_extension(code_attribute):
                    logger.debug(
                        f"Extension '{code_attribute}' not found for spaCy entity {ent.text}"
                    )
                    continue

                code_value = getattr(ent._, code_attribute, None)
                if code_value is None:
                    logger.debug(
                        f"No {code_attribute} found for spaCy entity {ent.text}"
                    )
                    continue

                condition = create_condition(
                    subject=patient_ref,
                    code=code_value,
                    display=ent.text,
                    system=coding_system,
                )
                set_condition_category(condition, "problem-list-item")
                logger.debug(
                    f"Adding condition from spaCy: {condition.model_dump(exclude_none=True)}"
                )
                new_conditions.append(condition)

        # 2. Extract from generic NLP entities (framework-agnostic)
        generic_entities = self.nlp.get_entities()
        if generic_entities:
            for ent_dict in generic_entities:
                # Skip if no linked code
                code_value = ent_dict.get(code_attribute)
                if code_value is None:
                    logger.debug(
                        f"No {code_attribute} found for entity {ent_dict.get('text', 'unknown')}"
                    )
                    continue

                entity_text = ent_dict.get("text", "unknown")

                condition = create_condition(
                    subject=patient_ref,
                    code=code_value,
                    display=entity_text,
                    system=coding_system,
                )
                set_condition_category(condition, "problem-list-item")
                logger.debug(
                    f"Adding condition from entities: {condition.model_dump(exclude_none=True)}"
                )
                new_conditions.append(condition)

        # Update problem list with combined conditions (replace to avoid duplication)
        if new_conditions:
            all_conditions = existing_conditions + new_conditions
            self.fhir.add_resources(all_conditions, "Condition", replace=True)

    def __iter__(self) -> Iterator[str]:
        """
        Iterate through the document's tokens.

        Returns:
            Iterator[str]: Iterator over the document tokens.
        """
        return iter(self._nlp._tokens)

    def __len__(self) -> int:
        """
        Return the length of the document's text.

        Returns:
            int: Character length of the document text.
        """
        return len(self.text)
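
The container also accepts structured input: a list of FHIR resources is wrapped in a Bundle automatically and .text is left empty. A sketch, with the Condition construction abbreviated as in the docstring:

conditions = [Condition(...)]      # abbreviated, as in the docstring example
doc = Document(data=conditions)
print(doc.text)                    # "" (resource-only documents carry no text)
print(len(doc.fhir.problem_list))  # 1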

__iter__()

Iterate through the document's tokens.

RETURNS DESCRIPTION
Iterator[str]

Iterator[str]: Iterator over the document tokens.

Source code in healthchain/io/containers/document.py
def __iter__(self) -> Iterator[str]:
    """
    Iterate through the document's tokens.

    Returns:
        Iterator[str]: Iterator over the document tokens.
    """
    return iter(self._nlp._tokens)

__len__()

Return the length of the document's text.

RETURNS DESCRIPTION
int

Character length of the document text.

TYPE: int

Source code in healthchain/io/containers/document.py
def __len__(self) -> int:
    """
    Return the length of the document's text.

    Returns:
        int: Character length of the document text.
    """
    return len(self.text)

__post_init__()

Post-initialization setup to process textual or FHIR data.

  • If input data is a FHIR Bundle, stores it and extracts OperationOutcome and Provenance resources.
  • If input data is a list of FHIR resources, wraps them in a Bundle.
  • For text input, sets .text field accordingly.
  • Performs basic whitespace tokenization if necessary.
Source code in healthchain/io/containers/document.py
def __post_init__(self):
    """
    Post-initialization setup to process textual or FHIR data.

    - If input data is a FHIR Bundle, stores it and extracts OperationOutcome and Provenance resources.
    - If input data is a list of FHIR resources, wraps them in a Bundle.
    - For text input, sets .text field accordingly.
    - Performs basic whitespace tokenization if necessary.
    """
    super().__post_init__()

    # Handle FHIR Bundle data
    if isinstance(self.data, Bundle):
        self._fhir._bundle = self.data

        # Extract OperationOutcome resources (operation results/errors)
        outcomes = extract_resources(self._fhir._bundle, "OperationOutcome")
        if outcomes:
            self._fhir._operation_outcomes = outcomes

        # Extract Provenance resources (data lineage/origin)
        provenances = extract_resources(self._fhir._bundle, "Provenance")
        if provenances:
            self._fhir._provenances = provenances

        self.text = ""  # No text content for bundle-only documents
    # Handle list of FHIR resources
    elif (
        isinstance(self.data, list)
        and self.data
        and isinstance(self.data[0], Resource)
    ):
        self._fhir._bundle = create_bundle()
        for resource in self.data:
            add_resource(self._fhir._bundle, resource)
        self.text = ""  # No text content for resource-only documents
    else:
        # Handle text data
        self.text = self.data if isinstance(self.data, str) else str(self.data)

    if not self._nlp._tokens and self.text:
        self._nlp._tokens = self.text.split()  # Basic tokenization if not provided

update_problem_list_from_nlp(patient_ref='Patient/123', coding_system='http://snomed.info/sct', code_attribute='cui')

Populate or update the problem list using entities extracted via NLP.

This method looks for entities with associated medical codes and creates FHIR Condition resources from them. It supports a three-step process:
  1. NER: Extract entities from text (spaCy, HuggingFace, etc.)
  2. Entity Linking: Add medical codes to those entities
  3. Problem List Creation: Convert linked entities to FHIR conditions (this method)

The method extracts from:
  1. spaCy entities with extension attributes (e.g., ent._.cui)
  2. Generic entities in the NLP annotations container (framework-agnostic)

TODO: make this more generic and support other resource types

PARAMETER DESCRIPTION
patient_ref

FHIR reference to the patient (default: "Patient/123")

TYPE: str DEFAULT: 'Patient/123'

coding_system

Coding system URI for the conditions (default: SNOMED CT)

TYPE: str DEFAULT: 'http://snomed.info/sct'

code_attribute

Name of the attribute containing the medical code (default: "cui")

TYPE: str DEFAULT: 'cui'

Notes
  • Preserves any existing problem list Conditions.
  • Supports framework-agnostic extraction (spaCy and dict entities).
  • For spaCy, looks for entity extension attribute (e.g. ent._.cui).
  • For non-spaCy, expects codes as dict keys (ent["cui"], etc.).
Source code in healthchain/io/containers/document.py
def update_problem_list_from_nlp(
    self,
    patient_ref: str = "Patient/123",
    coding_system: str = "http://snomed.info/sct",
    code_attribute: str = "cui",
):
    """
    Populate or update the problem list using entities extracted via NLP.

    This method looks for entities with associated medical codes and creates FHIR Condition
    resources from them. It supports a three-step process:
    1. NER: Extract entities from text (spaCy, HuggingFace, etc.)
    2. Entity Linking: Add medical codes to those entities
    3. Problem List Creation: Convert linked entities to FHIR conditions (this method)

    The method extracts from:
    1. spaCy entities with extension attributes (e.g., ent._.cui)
    2. Generic entities in the NLP annotations container (framework-agnostic)

    TODO: make this more generic and support other resource types

    Args:
        patient_ref: FHIR reference to the patient (default: "Patient/123")
        coding_system: Coding system URI for the conditions (default: SNOMED CT)
        code_attribute: Name of the attribute containing the medical code (default: "cui")

    Notes:
        - Preserves any existing problem list Conditions.
        - Supports framework-agnostic extraction (spaCy and dict entities).
        - For spaCy, looks for entity extension attribute (e.g. ent._.cui).
        - For non-spaCy, expects codes as dict keys (ent["cui"], etc.).
    """
    # Start with existing conditions to preserve them
    existing_conditions = self.fhir.problem_list.copy()
    new_conditions = []

    # 1. Extract from spaCy entities (if available)
    if self.nlp._spacy_doc and self.nlp._spacy_doc.ents:
        for ent in self.nlp._spacy_doc.ents:
            if not Span.has_extension(code_attribute):
                logger.debug(
                    f"Extension '{code_attribute}' not found for spaCy entity {ent.text}"
                )
                continue

            code_value = getattr(ent._, code_attribute, None)
            if code_value is None:
                logger.debug(
                    f"No {code_attribute} found for spaCy entity {ent.text}"
                )
                continue

            condition = create_condition(
                subject=patient_ref,
                code=code_value,
                display=ent.text,
                system=coding_system,
            )
            set_condition_category(condition, "problem-list-item")
            logger.debug(
                f"Adding condition from spaCy: {condition.model_dump(exclude_none=True)}"
            )
            new_conditions.append(condition)

    # 2. Extract from generic NLP entities (framework-agnostic)
    generic_entities = self.nlp.get_entities()
    if generic_entities:
        for ent_dict in generic_entities:
            # Skip if no linked code
            code_value = ent_dict.get(code_attribute)
            if code_value is None:
                logger.debug(
                    f"No {code_attribute} found for entity {ent_dict.get('text', 'unknown')}"
                )
                continue

            entity_text = ent_dict.get("text", "unknown")

            condition = create_condition(
                subject=patient_ref,
                code=code_value,
                display=entity_text,
                system=coding_system,
            )
            set_condition_category(condition, "problem-list-item")
            logger.debug(
                f"Adding condition from entities: {condition.model_dump(exclude_none=True)}"
            )
            new_conditions.append(condition)

    # Update problem list with combined conditions (replace to avoid duplication)
    if new_conditions:
        all_conditions = existing_conditions + new_conditions
        self.fhir.add_resources(all_conditions, "Condition", replace=True)
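
Assuming an upstream component has already attached linked entities (each carrying a "cui" code), converting them into problem-list Conditions is a single call (a minimal sketch):

doc.update_problem_list_from_nlp(patient_ref="Patient/abc", code_attribute="cui")
for condition in doc.fhir.problem_list:
    print(condition.model_dump(exclude_none=True))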

word_count()

Return the number of word tokens in the document.

RETURNS DESCRIPTION
int

The count of tokenized words in the document.

TYPE: int

Source code in healthchain/io/containers/document.py
def word_count(self) -> int:
    """
    Return the number of word tokens in the document.

    Returns:
        int: The count of tokenized words in the document.
    """
    return len(self._nlp._tokens)

FhirData dataclass

Container for FHIR resource data and its context.

Stores and manages clinical data in FHIR format. Access document references within resources easily through convenience functions.

Also allows you to set common continuity of care lists, such as a problem list, medication list, and allergy list. These collections are accessible as properties of the class instance.

TODO: make problem, meds, allergy lists configurable

Properties

bundle: The FHIR bundle containing resources
prefetch_resources: Dictionary of CDS Hooks prefetch resources
problem_list: List of Condition resources
medication_list: List of MedicationStatement resources
allergy_list: List of AllergyIntolerance resources

Example

fhir = FhirData()
# Add prefetch resources from CDS request
fhir.prefetch_resources = {"patient": patient_resource}
# Add document to bundle
doc_id = fhir.add_document_reference(document)
# Get document with relationships
doc_family = fhir.get_document_reference_family(doc_id)
# Access clinical lists
conditions = fhir.problem_list

Source code in healthchain/io/containers/document.py
@dataclass
class FhirData:
    """
    Container for FHIR resource data and its context.

    Stores and manages clinical data in FHIR format.
    Access document references within resources easily through convenience functions.

    Also allows you to set common continuity of care lists,
    such as a problem list, medication list, and allergy list.
    These collections are accessible as properties of the class instance.

    TODO: make problem, meds, allergy lists configurable

    Properties:
        bundle: The FHIR bundle containing resources
        prefetch_resources: Dictionary of CDS Hooks prefetch resources
        problem_list: List of Condition resources
        medication_list: List of MedicationStatement resources
        allergy_list: List of AllergyIntolerance resources

    Example:
        >>> fhir = FhirData()
        >>> # Add prefetch resources from CDS request
        >>> fhir.prefetch_resources = {"patient": patient_resource}
        >>> # Add document to bundle
        >>> doc_id = fhir.add_document_reference(document)
        >>> # Get document with relationships
        >>> doc_family = fhir.get_document_reference_family(doc_id)
        >>> # Access clinical lists
        >>> conditions = fhir.problem_list
    """

    _prefetch_resources: Optional[Dict[str, Resource]] = None
    _bundle: Optional[Bundle] = None
    _operation_outcomes: List[OperationOutcome] = field(default_factory=list)
    _provenances: List[Provenance] = field(default_factory=list)

    @property
    def bundle(self) -> Optional[Bundle]:
        """Returns the FHIR Bundle if it exists."""
        return self._bundle

    @bundle.setter
    def bundle(self, bundle: Bundle):
        """Sets the FHIR Bundle.
        The bundle is a collection of FHIR resources.
        See: https://www.hl7.org/fhir/bundle.html
        """
        self._bundle = bundle

    @property
    def prefetch_resources(self) -> Optional[Dict[str, Resource]]:
        """Returns the prefetch FHIR resources."""
        return self._prefetch_resources

    @prefetch_resources.setter
    def prefetch_resources(self, resources: Dict[str, Resource]):
        """Sets the prefetch FHIR resources from CDS service requests."""
        self._prefetch_resources = resources

    @property
    def operation_outcomes(self) -> List[OperationOutcome]:
        """Get extracted OperationOutcome resources separated from the bundle."""
        return self._operation_outcomes

    @operation_outcomes.setter
    def operation_outcomes(self, outcomes: List[OperationOutcome]) -> None:
        self._operation_outcomes = outcomes or []

    @property
    def provenances(self) -> List[Provenance]:
        """Get extracted Provenance resources separated from the bundle."""
        return self._provenances

    @provenances.setter
    def provenances(self, provenances: List[Provenance]) -> None:
        self._provenances = provenances or []

    @property
    def patient(self) -> Optional[Patient]:
        """Get the first Patient resource from the bundle (convenience accessor).

        Returns None if no Patient resources are present in the bundle.
        For bundles with multiple patients, use the patients property instead.
        """
        patients = self.get_resources("Patient")
        return patients[0] if patients else None

    @property
    def patients(self) -> List[Patient]:
        """Get all Patient resources from the bundle.

        Most bundles contain a single patient, but some queries (e.g., family history,
        population queries) may return multiple patients. This property provides access
        to all Patient resources without removing them from the bundle.
        """
        return self.get_resources("Patient")

    @property
    def problem_list(self) -> List[Condition]:
        """Get problem list from the bundle.
        Problem list items are stored as Condition resources in the bundle.
        See: https://www.hl7.org/fhir/condition.html
        """
        return self.get_resources("Condition")

    @problem_list.setter
    def problem_list(self, conditions: List[Condition]) -> None:
        # TODO: should make this behaviour more explicit whether it's adding or replacing
        """Set problem list in the bundle."""
        self.add_resources(conditions, "Condition")

    @property
    def medication_list(self) -> List[MedicationStatement]:
        """Get medication list from the bundle."""
        return self.get_resources("MedicationStatement")

    @medication_list.setter
    def medication_list(self, medications: List[MedicationStatement]) -> None:
        """Set medication list in the bundle.
        Medication statements are stored as MedicationStatement resources in the bundle.
        See: https://www.hl7.org/fhir/medicationstatement.html
        """
        self.add_resources(medications, "MedicationStatement")

    @property
    def allergy_list(self) -> List[AllergyIntolerance]:
        """Get allergy list from the bundle."""
        return self.get_resources("AllergyIntolerance")

    @allergy_list.setter
    def allergy_list(self, allergies: List[AllergyIntolerance]) -> None:
        """Set allergy list in the bundle.
        Allergy intolerances are stored as AllergyIntolerance resources in the bundle.
        See: https://www.hl7.org/fhir/allergyintolerance.html
        """
        self.add_resources(allergies, "AllergyIntolerance")

    def get_prefetch_resources(self, key: str) -> List[Any]:
        """Get resources of a specific type from the prefetch bundle."""
        if not self._prefetch_resources:
            return []
        return self._prefetch_resources.get(key, [])

    def get_resources(self, resource_type: str) -> List[Any]:
        """Get resources of a specific type from the working bundle."""
        if not self._bundle:
            return []
        return get_resources(self._bundle, resource_type)

    def add_resources(
        self, resources: List[Any], resource_type: str, replace: bool = False
    ):
        """Add resources to the working bundle."""
        if not self._bundle:
            self._bundle = create_bundle()
        set_resources(self._bundle, resources, resource_type, replace=replace)

    def add_document_reference(
        self,
        document: DocumentReference,
        parent_id: Optional[str] = None,
        relationship_type: Optional[str] = "transforms",
    ) -> str:
        """
        Adds a DocumentReference resource to the FHIR bundle and establishes
        relationships between documents if a parent_id is provided. The relationship is
        tracked using the FHIR relatesTo element with a specified relationship type.
        See: https://build.fhir.org/documentreference-definitions.html#DocumentReference.relatesTo

        Args:
            document: The DocumentReference to add to the bundle
            parent_id: Optional ID of the parent document. If provided, establishes a
                relationship between this document and its parent.
            relationship_type: The type of relationship to establish with the parent
                document. Defaults to "transforms". This is used in the FHIR relatesTo
                element's code. See: http://hl7.org/fhir/valueset-document-relationship-type

        Returns:
            str: The ID of the added document. If the document had no ID, a new UUID-based
                ID is generated.
        """
        # Generate a consistent ID if not present
        if not document.id:
            document.id = f"doc-{uuid4()}"

        # Add relationship metadata if there's a parent
        if parent_id:
            if not hasattr(document, "relatesTo") or not document.relatesTo:
                document.relatesTo = []
            document.relatesTo.append(
                DocumentReferenceRelatesTo(
                    target=Reference(reference=f"DocumentReference/{parent_id}"),
                    code=create_single_codeable_concept(
                        code=relationship_type,
                        display=relationship_type.capitalize(),
                        system="http://hl7.org/fhir/ValueSet/document-relationship-type",
                    ),
                )
            )

        self.add_resources([document], "DocumentReference", replace=False)

        return document.id

    def get_document_references_readable(
        self, include_data: bool = True, include_relationships: bool = True
    ) -> List[Dict[str, Any]]:
        """
        Get DocumentReferences resources with their content and optional relationship data
        in a human-readable dictionary format.

        Args:
            include_data: If True, decode and include the document data (default: True)
            include_relationships: If True, include related document information (default: True)

        Returns:
            List of documents with metadata and optionally their content and relationships
        """
        documents = []
        for doc in self.get_resources("DocumentReference"):
            doc_data = {
                "id": doc.id,
                "description": doc.description,
                "status": doc.status,
            }

            attachments = read_content_attachment(doc, include_data=include_data)
            if attachments:
                doc_data["attachments"] = []
                for attachment in attachments:
                    if include_data:
                        doc_data["attachments"].append(
                            {
                                "data": attachment.get("data"),
                                "metadata": attachment.get("metadata"),
                            }
                        )
                    else:
                        doc_data["attachments"].append(
                            {"metadata": attachment.get("metadata")}
                        )

            if include_relationships:
                family = self.get_document_reference_family(doc.id)
                doc_data["relationships"] = {
                    "parents": [
                        {"id": p.id, "description": p.description}
                        for p in family["parents"]
                    ],
                    "children": [
                        {"id": c.id, "description": c.description}
                        for c in family["children"]
                    ],
                    "siblings": [
                        {"id": s.id, "description": s.description}
                        for s in family["siblings"]
                    ],
                }

            documents.append(doc_data)

        return documents

    def get_document_reference_family(self, document_id: str) -> Dict[str, Any]:
        """
        Get a DocumentReference resource and all its related resources
        based on the relatesTo element in the FHIR standard.
        See: https://build.fhir.org/documentreference-definitions.html#DocumentReference.relatesTo

        Args:
            document_id: ID of the DocumentReference resource to find relationships for

        Returns:
            Dict containing:
                'document': The requested DocumentReference resource
                'parents': List of parent DocumentReference resources
                'children': List of child DocumentReference resources
                'siblings': List of DocumentReference resources sharing the same parent
        """
        documents = self.get_resources("DocumentReference")
        family = {"document": None, "parents": [], "children": [], "siblings": []}

        # Find the requested document
        target_doc = next((doc for doc in documents if doc.id == document_id), None)
        if not target_doc:
            return family

        family["document"] = target_doc

        # Find direct relationships
        if hasattr(target_doc, "relatesTo") and target_doc.relatesTo:
            # Find parents from target's relationships
            for relation in target_doc.relatesTo:
                parent_ref = relation.target.reference
                parent_id = parent_ref.split("/")[-1]
                parent = next((doc for doc in documents if doc.id == parent_id), None)
                if parent:
                    family["parents"].append(parent)

        # Find children and siblings
        for doc in documents:
            if not hasattr(doc, "relatesTo") or not doc.relatesTo:
                continue

            for relation in doc.relatesTo:
                target_ref = relation.target.reference
                related_id = target_ref.split("/")[-1]

                # Check if this doc is a child of our target
                if related_id == document_id:
                    family["children"].append(doc)

                # For siblings, check if they share the same parent
                elif family["parents"] and related_id == family["parents"][0].id:
                    if doc.id != document_id:  # Don't include self as sibling
                        family["siblings"].append(doc)

        return family

allergy_list property writable

Get allergy list from the bundle.

bundle property writable

Returns the FHIR Bundle if it exists.

medication_list property writable

Get medication list from the bundle.

operation_outcomes property writable

Get extracted OperationOutcome resources separated from the bundle.

patient property

Get the first Patient resource from the bundle (convenience accessor).

Returns None if no Patient resources are present in the bundle. For bundles with multiple patients, use the patients property instead.

patients property

Get all Patient resources from the bundle.

Most bundles contain a single patient, but some queries (e.g., family history, population queries) may return multiple patients. This property provides access to all Patient resources without removing them from the bundle.

prefetch_resources property writable

Returns the prefetch FHIR resources.

problem_list property writable

Get problem list from the bundle. Problem list items are stored as Condition resources in the bundle. See: https://www.hl7.org/fhir/condition.html

provenances property writable

Get extracted Provenance resources separated from the bundle.
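
Example (a minimal sketch; `fhir` stands for an instance of this container, assumed to already exist, and the required fields of a Condition vary by FHIR release):

from fhir.resources.condition import Condition

# Hypothetical minimal Condition; real usage would also set a code.
new_problem = Condition(
    id="cond-1",
    subject={"reference": "Patient/123"},
    clinicalStatus={
        "coding": [
            {
                "system": "http://terminology.hl7.org/CodeSystem/condition-clinical",
                "code": "active",
            }
        ]
    },
)

fhir.problem_list = [new_problem]  # stored as Condition resources in the bundle
for condition in fhir.problem_list:
    print(condition.id)  # "cond-1"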

add_document_reference(document, parent_id=None, relationship_type='transforms')

Adds a DocumentReference resource to the FHIR bundle and establishes relationships between documents if a parent_id is provided. The relationship is tracked using the FHIR relatesTo element with a specified relationship type. See: https://build.fhir.org/documentreference-definitions.html#DocumentReference.relatesTo

PARAMETER DESCRIPTION
document

The DocumentReference to add to the bundle

TYPE: DocumentReference

parent_id

Optional ID of the parent document. If provided, establishes a relationship between this document and its parent.

TYPE: Optional[str] DEFAULT: None

relationship_type

The type of relationship to establish with the parent document. Defaults to "transforms". This is used in the FHIR relatesTo element's code. See: http://hl7.org/fhir/valueset-document-relationship-type

TYPE: Optional[str] DEFAULT: 'transforms'

RETURNS DESCRIPTION
str

The ID of the added document. If the document had no ID, a new UUID-based ID is generated.

TYPE: str

Source code in healthchain/io/containers/document.py
def add_document_reference(
    self,
    document: DocumentReference,
    parent_id: Optional[str] = None,
    relationship_type: Optional[str] = "transforms",
) -> str:
    """
    Adds a DocumentReference resource to the FHIR bundle and establishes
    relationships between documents if a parent_id is provided. The relationship is
    tracked using the FHIR relatesTo element with a specified relationship type.
    See: https://build.fhir.org/documentreference-definitions.html#DocumentReference.relatesTo

    Args:
        document: The DocumentReference to add to the bundle
        parent_id: Optional ID of the parent document. If provided, establishes a
            relationship between this document and its parent.
        relationship_type: The type of relationship to establish with the parent
            document. Defaults to "transforms". This is used in the FHIR relatesTo
            element's code. See: http://hl7.org/fhir/valueset-document-relationship-type

    Returns:
        str: The ID of the added document. If the document had no ID, a new UUID-based
            ID is generated.
    """
    # Generate a consistent ID if not present
    if not document.id:
        document.id = f"doc-{uuid4()}"

    # Add relationship metadata if there's a parent
    if parent_id:
        if not hasattr(document, "relatesTo") or not document.relatesTo:
            document.relatesTo = []
        document.relatesTo.append(
            DocumentReferenceRelatesTo(
                target=Reference(reference=f"DocumentReference/{parent_id}"),
                code=create_single_codeable_concept(
                    code=relationship_type,
                    display=relationship_type.capitalize(),
                    system="http://hl7.org/fhir/ValueSet/document-relationship-type",
                ),
            )
        )

    self.add_resources([document], "DocumentReference", replace=False)

    return document.id
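
Example (a minimal sketch; `fhir` stands for an instance of this container, and the attachment data is illustrative base64 for "original" and "derived"):

from fhir.resources.documentreference import DocumentReference

original = DocumentReference(
    status="current",
    description="Original note",
    content=[{"attachment": {"contentType": "text/plain", "data": "b3JpZ2luYWw="}}],
)
derived = DocumentReference(
    status="current",
    description="Summarized note",
    content=[{"attachment": {"contentType": "text/plain", "data": "ZGVyaXZlZA=="}}],
)

parent_id = fhir.add_document_reference(original)
# Links the derived document to its parent via relatesTo ("transforms").
child_id = fhir.add_document_reference(derived, parent_id=parent_id)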

add_resources(resources, resource_type, replace=False)

Add resources to the working bundle.

Source code in healthchain/io/containers/document.py
def add_resources(
    self, resources: List[Any], resource_type: str, replace: bool = False
):
    """Add resources to the working bundle."""
    if not self._bundle:
        self._bundle = create_bundle()
    set_resources(self._bundle, resources, resource_type, replace=replace)
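
Example of the replace flag (hypothetical `fhir` container and `new_meds` list of MedicationStatement resources built elsewhere):

# Append to any existing MedicationStatement entries (default behaviour).
fhir.add_resources(new_meds, "MedicationStatement")

# Or replace them wholesale instead of appending.
fhir.add_resources(new_meds, "MedicationStatement", replace=True)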

get_document_reference_family(document_id)

Get a DocumentReference resource and all its related resources based on the relatesTo element in the FHIR standard. See: https://build.fhir.org/documentreference-definitions.html#DocumentReference.relatesTo

PARAMETER DESCRIPTION
document_id

ID of the DocumentReference resource to find relationships for

TYPE: str

RETURNS DESCRIPTION
Dict[str, Any]

Dict containing:
    'document': The requested DocumentReference resource
    'parents': List of parent DocumentReference resources
    'children': List of child DocumentReference resources
    'siblings': List of DocumentReference resources sharing the same parent

Source code in healthchain/io/containers/document.py
def get_document_reference_family(self, document_id: str) -> Dict[str, Any]:
    """
    Get a DocumentReference resource and all its related resources
    based on the relatesTo element in the FHIR standard.
    See: https://build.fhir.org/documentreference-definitions.html#DocumentReference.relatesTo

    Args:
        document_id: ID of the DocumentReference resource to find relationships for

    Returns:
        Dict containing:
            'document': The requested DocumentReference resource
            'parents': List of parent DocumentReference resources
            'children': List of child DocumentReference resources
            'siblings': List of DocumentReference resources sharing the same parent
    """
    documents = self.get_resources("DocumentReference")
    family = {"document": None, "parents": [], "children": [], "siblings": []}

    # Find the requested document
    target_doc = next((doc for doc in documents if doc.id == document_id), None)
    if not target_doc:
        return family

    family["document"] = target_doc

    # Find direct relationships
    if hasattr(target_doc, "relatesTo") and target_doc.relatesTo:
        # Find parents from target's relationships
        for relation in target_doc.relatesTo:
            parent_ref = relation.target.reference
            parent_id = parent_ref.split("/")[-1]
            parent = next((doc for doc in documents if doc.id == parent_id), None)
            if parent:
                family["parents"].append(parent)

    # Find children and siblings
    for doc in documents:
        if not hasattr(doc, "relatesTo") or not doc.relatesTo:
            continue

        for relation in doc.relatesTo:
            target_ref = relation.target.reference
            related_id = target_ref.split("/")[-1]

            # Check if this doc is a child of our target
            if related_id == document_id:
                family["children"].append(doc)

            # For siblings, check if they share the same parent
            elif family["parents"] and related_id == family["parents"][0].id:
                if doc.id != document_id:  # Don't include self as sibling
                    family["siblings"].append(doc)

    return family
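
Continuing the add_document_reference sketch above, the family lookup returns the child document together with its parent:

family = fhir.get_document_reference_family(child_id)
print(family["document"].description)      # "Summarized note"
print([p.id for p in family["parents"]])   # [parent_id]
print([c.id for c in family["children"]])  # documents whose relatesTo targets child_id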

get_document_references_readable(include_data=True, include_relationships=True)

Get DocumentReference resources with their content and optional relationship data in a human-readable dictionary format.

PARAMETER DESCRIPTION
include_data

If True, decode and include the document data (default: True)

TYPE: bool DEFAULT: True

include_relationships

If True, include related document information (default: True)

TYPE: bool DEFAULT: True

RETURNS DESCRIPTION
List[Dict[str, Any]]

List of documents with metadata and optionally their content and relationships

Source code in healthchain/io/containers/document.py
def get_document_references_readable(
    self, include_data: bool = True, include_relationships: bool = True
) -> List[Dict[str, Any]]:
    """
    Get DocumentReference resources with their content and optional relationship data
    in a human-readable dictionary format.

    Args:
        include_data: If True, decode and include the document data (default: True)
        include_relationships: If True, include related document information (default: True)

    Returns:
        List of documents with metadata and optionally their content and relationships
    """
    documents = []
    for doc in self.get_resources("DocumentReference"):
        doc_data = {
            "id": doc.id,
            "description": doc.description,
            "status": doc.status,
        }

        attachments = read_content_attachment(doc, include_data=include_data)
        if attachments:
            doc_data["attachments"] = []
            for attachment in attachments:
                if include_data:
                    doc_data["attachments"].append(
                        {
                            "data": attachment.get("data"),
                            "metadata": attachment.get("metadata"),
                        }
                    )
                else:
                    doc_data["attachments"].append(
                        {"metadata": attachment.get("metadata")}
                    )

        if include_relationships:
            family = self.get_document_reference_family(doc.id)
            doc_data["relationships"] = {
                "parents": [
                    {"id": p.id, "description": p.description}
                    for p in family["parents"]
                ],
                "children": [
                    {"id": c.id, "description": c.description}
                    for c in family["children"]
                ],
                "siblings": [
                    {"id": s.id, "description": s.description}
                    for s in family["siblings"]
                ],
            }

        documents.append(doc_data)

    return documents
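
Example (continuing the sketch above; metadata only, with the attachment data left out):

for entry in fhir.get_document_references_readable(include_data=False):
    print(entry["id"], entry["status"], entry["description"])
    for parent in entry["relationships"]["parents"]:
        print("  derived from:", parent["id"])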

get_prefetch_resources(key)

Get resources of a specific type from the prefetch bundle.

Source code in healthchain/io/containers/document.py
def get_prefetch_resources(self, key: str) -> List[Any]:
    """Get resources of a specific type from the prefetch bundle."""
    if not self._prefetch_resources:
        return []
    return self._prefetch_resources.get(key, [])

get_resources(resource_type)

Get resources of a specific type from the working bundle.

Source code in healthchain/io/containers/document.py
def get_resources(self, resource_type: str) -> List[Any]:
    """Get resources of a specific type from the working bundle."""
    if not self._bundle:
        return []
    return get_resources(self._bundle, resource_type)
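
Example (a minimal sketch; "conditions" is a hypothetical prefetch key that would mirror your CDS Hooks prefetch template):

conditions = fhir.get_prefetch_resources("conditions")
if not conditions:
    # Fall back to whatever is in the working bundle.
    conditions = fhir.get_resources("Condition")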

ModelOutputs dataclass

Container for storing and managing third-party integration model outputs.

This class stores outputs from different NLP/ML frameworks like Hugging Face and LangChain, organizing them by task type. Generated text can be retrieved per source and task via get_generated_text().

ATTRIBUTE DESCRIPTION
_huggingface_results

Dictionary storing Hugging Face model outputs, keyed by task name.

TYPE: Dict[str, Any]

_langchain_results

Dictionary storing LangChain outputs, keyed by task name.

TYPE: Dict[str, Any]

METHOD DESCRIPTION
add_output(source: str, task: str, output: Any)

Adds a model output for a specific source and task.

get_output(source: str, task: str) -> Any

Gets the model output for a specific source and task. Returns an empty dict if not found.

get_generated_text(source: str, task: str) -> List[str]

Returns generated text outputs for a given source and task.

Source code in healthchain/io/containers/document.py
@dataclass
class ModelOutputs:
    """
    Container for storing and managing third-party integration model outputs.

    This class stores outputs from different NLP/ML frameworks like Hugging Face
    and LangChain, organizing them by task type. Generated text can be retrieved
    per source and task via get_generated_text().

    Attributes:
        _huggingface_results (Dict[str, Any]): Dictionary storing Hugging Face model
            outputs, keyed by task name.
        _langchain_results (Dict[str, Any]): Dictionary storing LangChain outputs,
            keyed by task name.

    Methods:
        add_output(source: str, task: str, output: Any): Adds a model output for a
            specific source and task.
        get_output(source: str, task: str) -> Any: Gets the model output for a
            specific source and task. Returns an empty dict if not found.
        get_generated_text(source: str, task: str) -> List[str]: Returns generated
            text outputs for a given source and task.
    """

    _huggingface_results: Dict[str, Any] = field(default_factory=dict)
    _langchain_results: Dict[str, Any] = field(default_factory=dict)

    def add_output(self, source: str, task: str, output: Any):
        if source == "huggingface":
            self._huggingface_results[task] = output
        elif source == "langchain":
            self._langchain_results[task] = output
        else:
            raise ValueError(f"Unknown source: {source}")

    def get_output(self, source: str, task: str) -> Any:
        if source == "huggingface":
            return self._huggingface_results.get(task, {})
        elif source == "langchain":
            return self._langchain_results.get(task, {})
        raise ValueError(f"Unknown source: {source}")

    def get_generated_text(self, source: str, task: str) -> List[str]:
        """
        Returns generated text outputs for a given source and task.

        Handles different output formats for Hugging Face and LangChain. For
        Hugging Face, it extracts the last message content from chat-style
        outputs and common keys like "generated_text", "summary_text", and
        "translation". For LangChain, it converts JSON outputs to strings, and returns
        the output as is if it is already a string.

        Args:
            source (str): Framework name (e.g., "huggingface", "langchain").
            task (str): Task name for retrieving generated text.

        Returns:
            List[str]: List of generated text outputs, or an empty list if none.
        """
        generated_text = []

        if source == "huggingface":
            # Handle chat-style output format
            output = self._huggingface_results.get(task)
            if isinstance(output, list):
                for entry in output:
                    text = entry.get("generated_text")
                    if isinstance(text, list):
                        last_msg = text[-1]
                        if isinstance(last_msg, dict) and "content" in last_msg:
                            generated_text.append(last_msg["content"])
                    # Otherwise get common huggingface output keys
                    elif any(
                        key in entry
                        for key in ["generated_text", "summary_text", "translation"]
                    ):
                        generated_text.append(
                            text
                            or entry.get("summary_text")
                            or entry.get("translation")
                        )
            else:
                logger.warning("HuggingFace output is not a list of dictionaries. ")
        elif source == "langchain":
            output = self._langchain_results.get(task)
            # Check if output is a string
            if isinstance(output, str):
                generated_text.append(output)
            # Try to convert JSON to string
            elif isinstance(output, dict):
                try:
                    import json

                    output_str = json.dumps(output)
                    generated_text.append(output_str)
                except Exception:
                    logger.warning(
                        "LangChain output is not a string and could not be converted to JSON string. "
                        "Chains should output either a string or a JSON object."
                    )
            else:
                logger.warning(
                    "LangChain output is not a string. Chains should output either a string or a JSON object."
                )

        return generated_text
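
Example (illustrative task names and outputs):

outputs = ModelOutputs()

outputs.add_output("huggingface", "ner", [{"entity": "CONDITION", "word": "asthma"}])
print(outputs.get_output("huggingface", "ner"))      # [{'entity': 'CONDITION', 'word': 'asthma'}]
print(outputs.get_output("langchain", "summarize"))  # {} when nothing was stored

# Any other source raises ValueError:
# outputs.add_output("openai", "chat", "...")  # ValueError: Unknown source: openai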

get_generated_text(source, task)

Returns generated text outputs for a given source and task.

Handles different output formats for Hugging Face and LangChain. For Hugging Face, it extracts the last message content from chat-style outputs and common keys like "generated_text", "summary_text", and "translation". For LangChain, it converts JSON outputs to strings, and returns the output as is if it is already a string.

PARAMETER DESCRIPTION
source

Framework name (e.g., "huggingface", "langchain").

TYPE: str

task

Task name for retrieving generated text.

TYPE: str

RETURNS DESCRIPTION
List[str]

List[str]: List of generated text outputs, or an empty list if none.

Source code in healthchain/io/containers/document.py
def get_generated_text(self, source: str, task: str) -> List[str]:
    """
    Returns generated text outputs for a given source and task.

    Handles different output formats for Hugging Face and LangChain. For
    Hugging Face, it extracts the last message content from chat-style
    outputs and common keys like "generated_text", "summary_text", and
    "translation". For LangChain, it converts JSON outputs to strings, and returns
    the output as is if it is already a string.

    Args:
        source (str): Framework name (e.g., "huggingface", "langchain").
        task (str): Task name for retrieving generated text.

    Returns:
        List[str]: List of generated text outputs, or an empty list if none.
    """
    generated_text = []

    if source == "huggingface":
        # Handle chat-style output format
        output = self._huggingface_results.get(task)
        if isinstance(output, list):
            for entry in output:
                text = entry.get("generated_text")
                if isinstance(text, list):
                    last_msg = text[-1]
                    if isinstance(last_msg, dict) and "content" in last_msg:
                        generated_text.append(last_msg["content"])
                # Otherwise get common huggingface output keys
                elif any(
                    key in entry
                    for key in ["generated_text", "summary_text", "translation"]
                ):
                    generated_text.append(
                        text
                        or entry.get("summary_text")
                        or entry.get("translation")
                    )
        else:
            logger.warning("HuggingFace output is not a list of dictionaries. ")
    elif source == "langchain":
        output = self._langchain_results.get(task)
        # Check if output is a string
        if isinstance(output, str):
            generated_text.append(output)
        # Try to convert JSON to string
        elif isinstance(output, dict):
            try:
                import json

                output_str = json.dumps(output)
                generated_text.append(output_str)
            except Exception:
                logger.warning(
                    "LangChain output is not a string and could not be converted to JSON string. "
                    "Chains should output either a string or a JSON object."
                )
        else:
            logger.warning(
                "LangChain output is not a string. Chains should output either a string or a JSON object."
            )

    return generated_text
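
Example of the two supported formats (illustrative task names and outputs):

outputs = ModelOutputs()

# Hugging Face summarization-style output: a list of dicts.
outputs.add_output("huggingface", "summarization", [{"summary_text": "Patient is stable."}])

# LangChain output: a plain string or a JSON-serializable dict.
outputs.add_output("langchain", "triage", {"priority": "routine"})

print(outputs.get_generated_text("huggingface", "summarization"))  # ['Patient is stable.']
print(outputs.get_generated_text("langchain", "triage"))           # ['{"priority": "routine"}']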

NlpAnnotations dataclass

Container for NLP-specific annotations and results.

This class stores various NLP annotations and processing results from text analysis, including preprocessed text, tokens, named entities, embeddings and spaCy documents.

ATTRIBUTE DESCRIPTION
_preprocessed_text

The preprocessed version of the input text.

TYPE: str

_tokens

List of tokenized words from the text.

TYPE: List[str]

_entities

Named entities extracted from the text, with their labels and positions.

TYPE: List[Dict[str, Any]]

_embeddings

Vector embeddings generated from the text.

TYPE: Optional[List[float]]

_spacy_doc

The processed spaCy Doc object.

TYPE: Optional[Doc]

METHOD DESCRIPTION
add_spacy_doc(doc: SpacyDoc)

Processes a spaCy Doc to extract tokens and entities.

get_spacy_doc() -> Optional[SpacyDoc]

Returns the stored spaCy Doc object.

get_tokens() -> List[str]

Returns the list of tokens.

set_tokens(tokens: List[str])

Sets the token list.

set_entities(entities: List[Dict[str, Any]])

Sets the named entities list.

get_entities() -> List[Dict[str, Any]]

Returns the list of named entities.

get_embeddings() -> Optional[List[float]]

Returns the vector embeddings.

set_embeddings(embeddings: List[float])

Sets the vector embeddings.

Source code in healthchain/io/containers/document.py
@dataclass
class NlpAnnotations:
    """
    Container for NLP-specific annotations and results.

    This class stores various NLP annotations and processing results from text analysis,
    including preprocessed text, tokens, named entities, embeddings and spaCy documents.

    Attributes:
        _preprocessed_text (str): The preprocessed version of the input text.
        _tokens (List[str]): List of tokenized words from the text.
        _entities (List[Dict[str, Any]]): Named entities extracted from the text, with their labels and positions.
        _embeddings (Optional[List[float]]): Vector embeddings generated from the text.
        _spacy_doc (Optional[SpacyDoc]): The processed spaCy Doc object.

    Methods:
        add_spacy_doc(doc: SpacyDoc): Processes a spaCy Doc to extract tokens and entities.
        get_spacy_doc() -> Optional[SpacyDoc]: Returns the stored spaCy Doc object.
        get_tokens() -> List[str]: Returns the list of tokens.
        set_tokens(tokens: List[str]): Sets the token list.
        set_entities(entities: List[Dict[str, Any]]): Sets the named entities list.
        get_entities() -> List[Dict[str, Any]]: Returns the list of named entities.
        get_embeddings() -> Optional[List[float]]: Returns the vector embeddings.
        set_embeddings(embeddings: List[float]): Sets the vector embeddings.
    """

    _preprocessed_text: str = ""
    _tokens: List[str] = field(default_factory=list)
    _entities: List[Dict[str, Any]] = field(default_factory=list)
    _embeddings: Optional[List[float]] = None
    _spacy_doc: Optional[SpacyDoc] = None

    def add_spacy_doc(self, doc: SpacyDoc):
        self._spacy_doc = doc
        self._tokens = [token.text for token in doc]
        self._entities = [
            {
                "text": ent.text,
                "label": ent.label_,
                "start": ent.start_char,
                "end": ent.end_char,
            }
            for ent in doc.ents
        ]

    def get_spacy_doc(self) -> Optional[SpacyDoc]:
        return self._spacy_doc

    def get_tokens(self) -> List[str]:
        return self._tokens

    def set_tokens(self, tokens: List[str]):
        self._tokens = tokens

    def set_entities(self, entities: List[Dict[str, Any]]):
        self._entities = entities

    def get_entities(self) -> List[Dict[str, Any]]:
        return self._entities

    def get_embeddings(self) -> Optional[List[float]]:
        return self._embeddings

    def set_embeddings(self, embeddings: List[float]):
        self._embeddings = embeddings
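
Example using the plain setters, which need no spaCy pipeline (the offsets refer to the text "Patient denies chest pain."):

annotations = NlpAnnotations()

annotations.set_tokens(["Patient", "denies", "chest", "pain", "."])
annotations.set_entities(
    [{"text": "chest pain", "label": "SYMPTOM", "start": 15, "end": 25}]
)

print(annotations.get_tokens())
print(annotations.get_entities())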

featureschema

Feature schema definitions for FHIR to Dataset data conversion.

This module provides classes to define and manage feature schemas that map FHIR resources to pandas DataFrame columns for ML model deployment.

FeatureMapping

Bases: BaseModel

Maps a single feature to its FHIR source.

Source code in healthchain/io/containers/featureschema.py
class FeatureMapping(BaseModel):
    """Maps a single feature to its FHIR source."""

    name: str
    fhir_resource: str
    code: Optional[str] = None
    code_system: Optional[str] = None
    field: Optional[str] = None
    transform: Optional[str] = None
    dtype: str = "float64"
    required: bool = True
    unit: Optional[str] = None
    display: Optional[str] = None

    model_config = ConfigDict(extra="allow")

    @model_validator(mode="after")
    def validate_resource_requirements(self) -> "FeatureMapping":
        """Validate the feature mapping configuration based on resource type."""
        if self.fhir_resource == "Observation":
            if not self.code:
                raise ValueError(
                    f"Feature '{self.name}': Observation resources require a 'code'"
                )
            if not self.code_system:
                raise ValueError(
                    f"Feature '{self.name}': Observation resources require a 'code_system'"
                )
        elif self.fhir_resource == "Patient":
            if not self.field:
                raise ValueError(
                    f"Feature '{self.name}': Patient resources require a 'field'"
                )
        return self

    @classmethod
    def from_dict(cls, name: str, data: Dict[str, Any]) -> "FeatureMapping":
        """Create a FeatureMapping from a dictionary.

        Args:
            name: The feature name
            data: Dictionary containing feature configuration

        Returns:
            FeatureMapping instance
        """
        return cls(name=name, **data)
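
Example (LOINC 8867-4 is the standard heart-rate code; the feature names are illustrative):

# An Observation-backed feature must carry a code and code_system.
heart_rate = FeatureMapping(
    name="heart_rate",
    fhir_resource="Observation",
    code="8867-4",
    code_system="http://loinc.org",
    unit="beats/min",
)

# A Patient-backed feature must name the field to read instead.
age = FeatureMapping(name="age", fhir_resource="Patient", field="birthDate")

# Omitting the code would raise a ValueError from validate_resource_requirements:
# FeatureMapping(name="hr", fhir_resource="Observation")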

from_dict(name, data) classmethod

Create a FeatureMapping from a dictionary.

PARAMETER DESCRIPTION
name

The feature name

TYPE: str

data

Dictionary containing feature configuration

TYPE: Dict[str, Any]

RETURNS DESCRIPTION
FeatureMapping

FeatureMapping instance

Source code in healthchain/io/containers/featureschema.py
@classmethod
def from_dict(cls, name: str, data: Dict[str, Any]) -> "FeatureMapping":
    """Create a FeatureMapping from a dictionary.

    Args:
        name: The feature name
        data: Dictionary containing feature configuration

    Returns:
        FeatureMapping instance
    """
    return cls(name=name, **data)

validate_resource_requirements()

Validate the feature mapping configuration based on resource type.

Source code in healthchain/io/containers/featureschema.py
@model_validator(mode="after")
def validate_resource_requirements(self) -> "FeatureMapping":
    """Validate the feature mapping configuration based on resource type."""
    if self.fhir_resource == "Observation":
        if not self.code:
            raise ValueError(
                f"Feature '{self.name}': Observation resources require a 'code'"
            )
        if not self.code_system:
            raise ValueError(
                f"Feature '{self.name}': Observation resources require a 'code_system'"
            )
    elif self.fhir_resource == "Patient":
        if not self.field:
            raise ValueError(
                f"Feature '{self.name}': Patient resources require a 'field'"
            )
    return self

FeatureSchema

Bases: BaseModel

Schema defining how to extract features from FHIR resources.

Source code in healthchain/io/containers/featureschema.py
class FeatureSchema(BaseModel):
    """Schema defining how to extract features from FHIR resources."""

    name: str
    version: str
    features: Dict[str, FeatureMapping] = {}
    description: Optional[str] = None
    model_info: Optional[Dict[str, Any]] = None
    metadata: Optional[Dict[str, Any]] = None

    model_config = ConfigDict(extra="allow")

    @field_validator("features", mode="before")
    @classmethod
    def convert_feature_dicts(cls, v):
        """Convert feature dicts to FeatureMapping objects if needed."""
        if v and isinstance(v, dict):
            # Check if values are dicts (need conversion) or already FeatureMapping
            if v and isinstance(list(v.values())[0], dict):
                return {
                    name: FeatureMapping.from_dict(name, mapping)
                    for name, mapping in v.items()
                }
        return v

    @classmethod
    def from_yaml(cls, path: Union[str, Path]) -> "FeatureSchema":
        """Load schema from a YAML file.

        Args:
            path: Path to the YAML file

        Returns:
            FeatureSchema instance

        Example:
            >>> schema = FeatureSchema.from_yaml("configs/features/sepsis_vitals.yaml")
        """
        path = Path(path)
        with open(path, "r") as f:
            data = yaml.safe_load(f)

        return cls.model_validate(data)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "FeatureSchema":
        """Create a FeatureSchema from a dictionary.

        Args:
            data: Dictionary containing schema configuration

        Returns:
            FeatureSchema instance
        """
        return cls.model_validate(data)

    def to_dict(self) -> Dict[str, Any]:
        """Convert schema to dictionary format.

        Returns:
            Dictionary representation of the schema
        """
        result = {
            "name": self.name,
            "version": self.version,
            "description": self.description,
            "model_info": self.model_info,
            "features": {
                name: {
                    k: v
                    for k, v in mapping.model_dump().items()
                    if k != "name" and v is not None
                }
                for name, mapping in self.features.items()
            },
        }
        if self.metadata:
            result["metadata"] = self.metadata
        return result

    def to_yaml(self, path: Union[str, Path]) -> None:
        """Save schema to a YAML file.

        Args:
            path: Path where the YAML file will be saved
        """
        path = Path(path)
        path.parent.mkdir(parents=True, exist_ok=True)

        with open(path, "w") as f:
            yaml.dump(self.to_dict(), f, default_flow_style=False, sort_keys=False)

    def get_feature_names(self) -> List[str]:
        """Get list of feature names in order.

        Returns:
            List of feature names
        """
        return list(self.features.keys())

    def get_required_features(self) -> List[str]:
        """Get list of required feature names.

        Returns:
            List of required feature names
        """
        return [name for name, mapping in self.features.items() if mapping.required]

    def get_features_by_resource(self, resource_type: str) -> Dict[str, FeatureMapping]:
        """Get all features mapped to a specific FHIR resource type.

        Args:
            resource_type: FHIR resource type (e.g., "Observation", "Patient")

        Returns:
            Dictionary of features for the specified resource type
        """
        return {
            name: mapping
            for name, mapping in self.features.items()
            if mapping.fhir_resource == resource_type
        }

    def get_observation_codes(self) -> Dict[str, FeatureMapping]:
        """Get all Observation features with their codes.

        Returns:
            Dictionary mapping codes to feature mappings
        """
        observations = self.get_features_by_resource("Observation")
        return {
            mapping.code: mapping for mapping in observations.values() if mapping.code
        }

    def validate_dataframe_columns(self, columns: List[str]) -> Dict[str, Any]:
        """Validate that a DataFrame has the expected columns.

        Args:
            columns: List of column names from a DataFrame

        Returns:
            Dictionary with validation results:
                - valid: bool
                - missing_required: List of missing required features
                - unexpected: List of unexpected columns
                - missing_optional: List of optional features absent from the columns
        """
        expected = set(self.get_feature_names())
        actual = set(columns)
        required = set(self.get_required_features())

        missing_required = list(required - actual)
        unexpected = list(actual - expected)

        return {
            "valid": len(missing_required) == 0,
            "missing_required": missing_required,
            "unexpected": unexpected,
            "missing_optional": list((expected - required) - actual),
        }
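
Example (a hypothetical "sepsis_vitals" schema; nested feature dicts are converted to FeatureMapping objects by the validator):

schema = FeatureSchema.from_dict(
    {
        "name": "sepsis_vitals",
        "version": "1.0",
        "features": {
            "heart_rate": {
                "fhir_resource": "Observation",
                "code": "8867-4",
                "code_system": "http://loinc.org",
            },
            "age": {"fhir_resource": "Patient", "field": "birthDate", "required": False},
        },
    }
)

print(schema.get_feature_names())      # ['heart_rate', 'age']
print(schema.get_required_features())  # ['heart_rate']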

convert_feature_dicts(v) classmethod

Convert feature dicts to FeatureMapping objects if needed.

Source code in healthchain/io/containers/featureschema.py
@field_validator("features", mode="before")
@classmethod
def convert_feature_dicts(cls, v):
    """Convert feature dicts to FeatureMapping objects if needed."""
    if v and isinstance(v, dict):
        # Check if values are dicts (need conversion) or already FeatureMapping
        if v and isinstance(list(v.values())[0], dict):
            return {
                name: FeatureMapping.from_dict(name, mapping)
                for name, mapping in v.items()
            }
    return v

from_dict(data) classmethod

Create a FeatureSchema from a dictionary.

PARAMETER DESCRIPTION
data

Dictionary containing schema configuration

TYPE: Dict[str, Any]

RETURNS DESCRIPTION
FeatureSchema

FeatureSchema instance

Source code in healthchain/io/containers/featureschema.py
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "FeatureSchema":
    """Create a FeatureSchema from a dictionary.

    Args:
        data: Dictionary containing schema configuration

    Returns:
        FeatureSchema instance
    """
    return cls.model_validate(data)

from_yaml(path) classmethod

Load schema from a YAML file.

PARAMETER DESCRIPTION
path

Path to the YAML file

TYPE: Union[str, Path]

RETURNS DESCRIPTION
FeatureSchema

FeatureSchema instance

Example

schema = FeatureSchema.from_yaml("configs/features/sepsis_vitals.yaml")

Source code in healthchain/io/containers/featureschema.py
@classmethod
def from_yaml(cls, path: Union[str, Path]) -> "FeatureSchema":
    """Load schema from a YAML file.

    Args:
        path: Path to the YAML file

    Returns:
        FeatureSchema instance

    Example:
        >>> schema = FeatureSchema.from_yaml("configs/features/sepsis_vitals.yaml")
    """
    path = Path(path)
    with open(path, "r") as f:
        data = yaml.safe_load(f)

    return cls.model_validate(data)

get_feature_names()

Get list of feature names in order.

RETURNS DESCRIPTION
List[str]

List of feature names

Source code in healthchain/io/containers/featureschema.py
def get_feature_names(self) -> List[str]:
    """Get list of feature names in order.

    Returns:
        List of feature names
    """
    return list(self.features.keys())

get_features_by_resource(resource_type)

Get all features mapped to a specific FHIR resource type.

PARAMETER DESCRIPTION
resource_type

FHIR resource type (e.g., "Observation", "Patient")

TYPE: str

RETURNS DESCRIPTION
Dict[str, FeatureMapping]

Dictionary of features for the specified resource type

Source code in healthchain/io/containers/featureschema.py
def get_features_by_resource(self, resource_type: str) -> Dict[str, FeatureMapping]:
    """Get all features mapped to a specific FHIR resource type.

    Args:
        resource_type: FHIR resource type (e.g., "Observation", "Patient")

    Returns:
        Dictionary of features for the specified resource type
    """
    return {
        name: mapping
        for name, mapping in self.features.items()
        if mapping.fhir_resource == resource_type
    }

get_observation_codes()

Get all Observation features with their codes.

RETURNS DESCRIPTION
Dict[str, FeatureMapping]

Dictionary mapping codes to feature mappings

Source code in healthchain/io/containers/featureschema.py
def get_observation_codes(self) -> Dict[str, FeatureMapping]:
    """Get all Observation features with their codes.

    Returns:
        Dictionary mapping codes to feature mappings
    """
    observations = self.get_features_by_resource("Observation")
    return {
        mapping.code: mapping for mapping in observations.values() if mapping.code
    }
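
Continuing the sepsis_vitals sketch above:

codes = schema.get_observation_codes()
print(list(codes))           # ['8867-4']
print(codes["8867-4"].name)  # 'heart_rate'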

get_required_features()

Get list of required feature names.

RETURNS DESCRIPTION
List[str]

List of required feature names

Source code in healthchain/io/containers/featureschema.py
def get_required_features(self) -> List[str]:
    """Get list of required feature names.

    Returns:
        List of required feature names
    """
    return [name for name, mapping in self.features.items() if mapping.required]

to_dict()

Convert schema to dictionary format.

RETURNS DESCRIPTION
Dict[str, Any]

Dictionary representation of the schema

Source code in healthchain/io/containers/featureschema.py
def to_dict(self) -> Dict[str, Any]:
    """Convert schema to dictionary format.

    Returns:
        Dictionary representation of the schema
    """
    result = {
        "name": self.name,
        "version": self.version,
        "description": self.description,
        "model_info": self.model_info,
        "features": {
            name: {
                k: v
                for k, v in mapping.model_dump().items()
                if k != "name" and v is not None
            }
            for name, mapping in self.features.items()
        },
    }
    if self.metadata:
        result["metadata"] = self.metadata
    return result

to_yaml(path)

Save schema to a YAML file.

PARAMETER DESCRIPTION
path

Path where the YAML file will be saved

TYPE: Union[str, Path]

Source code in healthchain/io/containers/featureschema.py
def to_yaml(self, path: Union[str, Path]) -> None:
    """Save schema to a YAML file.

    Args:
        path: Path where the YAML file will be saved
    """
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)

    with open(path, "w") as f:
        yaml.dump(self.to_dict(), f, default_flow_style=False, sort_keys=False)

validate_dataframe_columns(columns)

Validate that a DataFrame has the expected columns.

PARAMETER DESCRIPTION
columns

List of column names from a DataFrame

TYPE: List[str]

RETURNS DESCRIPTION
Dict[str, Any]

Dictionary with validation results:
    - valid: bool
    - missing_required: List of missing required features
    - unexpected: List of unexpected columns
    - missing_optional: List of optional features absent from the columns

Source code in healthchain/io/containers/featureschema.py
def validate_dataframe_columns(self, columns: List[str]) -> Dict[str, Any]:
    """Validate that a DataFrame has the expected columns.

    Args:
        columns: List of column names from a DataFrame

    Returns:
        Dictionary with validation results:
            - valid: bool
            - missing_required: List of missing required features
            - unexpected: List of unexpected columns
            - missing_optional: List of optional features absent from the columns
    """
    expected = set(self.get_feature_names())
    actual = set(columns)
    required = set(self.get_required_features())

    missing_required = list(required - actual)
    unexpected = list(actual - expected)

    return {
        "valid": len(missing_required) == 0,
        "missing_required": missing_required,
        "unexpected": unexpected,
        "missing_optional": list((expected - required) - actual),
    }
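
Continuing the sepsis_vitals sketch above ("sbp" is an illustrative stray column):

report = schema.validate_dataframe_columns(["heart_rate", "sbp"])
print(report["valid"])             # True: every required feature is present
print(report["unexpected"])        # ['sbp']
print(report["missing_optional"])  # ['age']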