# Source code for ase.io.runner.reader

"""RuNNer input.data support

Read files in RuNNer's input.data file format.

Contains
--------
* `read_runnerdata`: Read structures from a RuNNer input.data file.

Reference
---------
* [The online documentation of RuNNer 2.0](https://runner-suite.gitlab.io/runner2)

Contributors
------------
* Maintainer and Author: [Alexander Knoll](mailto:alexander.knoll@rub.de)
* Author: [Redouan El Haouari](mailto:redouan.elhaouari@rub.de)
"""

import re
from collections.abc import Iterator
from typing import TextIO

from ase.atoms import Atoms
from ase.utils import reader

from .runneratoms import (
    DEFAULT_ATOM_LAYOUT,
    PROP_NAME_DICT,
    RuNNerAtoms,
    Units,
)

# Regex pattern for vector properties like "forces(3)"
_vector_pattern = re.compile(r'([\w\-]+)\((\d+)\)')


def _parse_begin(begin_line: str) -> list[tuple[str, int]]:
    """Parse the begin line to determine per-atom property layout.

    Returns a list of (name, length) for all properties (vector or scalar).
    Scalars have length 1.

    The two default fields "positions" and "element" are skipped.
    """
    fields = begin_line.strip().split()[1:]
    layout: list[tuple[str, int]] = []

    if len(fields) == 0:
        return DEFAULT_ATOM_LAYOUT

    # skip the first two fields (position, element), since they are fixed.
    for field in fields[2:]:
        match = _vector_pattern.fullmatch(field)
        if match:
            name, length = match.group(1), int(match.group(2))
        else:
            name = field
            length = 1

        # Normalize property names by replacing them with the defaults defined
        # in the dictionary PROP_NAME_DICT.
        name = PROP_NAME_DICT.get(name, name)

        layout.append((name, length))

    return layout


def _parse_chunk(
    chunk: str, begin_line: str, input_units: Units
) -> RuNNerAtoms:
    """Build a RuNNerAtoms object from the text of one structure.

    Parameters
    ----------
    chunk:
        The text following the structure's begin line. It is handed to
        `RuNNerAtoms.parse_line` one line at a time.
    begin_line:
        The begin line belonging to `chunk`; it determines the per-atom
        property layout via `_parse_begin`.
    input_units:
        Unit system passed on to the `RuNNerAtoms` constructor.

    Returns
    -------
    runneratoms:
        The object after every line of `chunk` has been parsed.
    """
    layout = _parse_begin(begin_line)

    # Total width of all per-atom properties named in the layout.
    property_columns = sum(width for _, width in layout)

    structure = RuNNerAtoms(
        atom_layout=layout,
        input_units=input_units,
        # NOTE(review): the extra 4 columns are presumably the three
        # Cartesian coordinates plus the element symbol -- confirm against
        # RuNNerAtoms.
        num_atom_columns=property_columns + 4,
    )

    for row in chunk.splitlines():
        structure.parse_line(row)

    return structure



# [docs]
@reader
def read_runnerdata(
    infile: TextIO,
    index: int | slice = -1,
    input_units: Units = Units.ATOMIC,
    output_units: Units = Units.ASE,
) -> Iterator[Atoms]:
    """Parse all structures within a RuNNer input.data file.

    input.data files contain all structural information needed to train a
    Behler-Parrinello-type neural network potential, e.g. Cart. coordinates,
    atomic forces, and energies. This function reads the file object `infile`
    and returns the structures selected by `index`. All structures will
    be converted to ASE units by default.

    This is a generator: nothing is read from `infile` until the first
    structure is requested.

    Parameters
    ----------
    infile:
        Python fileobj with the target input.data file.
    index:
        Which structures should be returned, either a single integer
        position or a slice. Returns only the last structure by default.
    input_units:
        The given input units. Can be 'Units.ASE' or 'Units.ATOMIC'.
    output_units:
        The desired output units. Can be 'Units.ASE' or 'Units.ATOMIC'.

    Yields
    ------
    images:
        All information about the structures within `index` of `infile`,
        including symbols, positions, atomic charges, and cell lattice. Every
        `Atoms` object has a `RunnerSinglePointCalculator` attached with
        additional information on the total energy, atomic forces, and total
        charge.

    Raises
    ------
    IndexError
        If `index` is an integer outside the range of structures in the
        file. Because this is a generator, the error surfaces on the first
        iteration.

    """
    # First, split input.data into separate structure "chunks".
    read_infile = infile.read()

    # Only accept "begin" as a delimiter when it is the first word on a line
    # (optionally indented). Matching it anywhere would also split on free
    # text that merely contains the word, e.g. "comment beginning of run".
    begin_pattern = re.compile(r'^[ \t]*begin\b.*\n', re.MULTILINE)

    # First chunk is discarded because it is the data before the
    # first "begin".
    chunks = begin_pattern.split(read_infile)[1:]
    begin_lines = begin_pattern.findall(read_infile)

    # Second, only parse the chunks which the user asked for.
    if isinstance(index, slice):
        selected_begin_lines = begin_lines[index]
        selected_chunks = chunks[index]
    else:
        # Indexing with a single integer returns one string, not a list.
        # Zipping two bare strings would pair up their characters, so wrap
        # the single selection in one-element lists.
        selected_begin_lines = [begin_lines[index]]
        selected_chunks = [chunks[index]]

    for begin_line, chunk in zip(selected_begin_lines, selected_chunks):
        runneratoms = _parse_chunk(chunk, begin_line, input_units)

        runneratoms.convert(output_units)
        yield runneratoms.to_ase_atoms()