Source code for fv3config.fv3run._native

import sys
import logging
import contextlib
import resource
import functools
import subprocess
import inspect
import multiprocessing
import os
import tempfile
import warnings
import json
from ..config import write_run_directory, get_n_processes, dump, load
from .. import filesystem

# Names of the log files created inside the run directory by
# _captured_output_context when output capturing is enabled.
STDOUT_FILENAME = "stdout.log"
STDERR_FILENAME = "stderr.log"
# Name of the file, inside the run directory, that run_native writes the
# configuration to.
CONFIG_OUT_FILENAME = "fv3config.yml"
# Baseline flags run_native hands to _run_experiment, which places them after
# ``mpirun -n <n_processes>``. _add_oversubscribe_if_necessary may extend them
# with "--oversubscribe".
MPI_FLAGS = [
    "--allow-run-as-root",
    "--use-hwthread-cpus",
    "--mca",
    "btl_vader_single_copy_mechanism",
    "none",
]
# Name of the environment variable _get_python_command consults for a model
# script when no explicit runfile is given.
RUNFILE_ENV_VAR = "FV3CONFIG_DEFAULT_RUNFILE"

# Logger shared by every helper in this module.
logger = logging.getLogger("fv3run")


def call_via_subprocess(module):
    """Decorate a function so it can be re-invoked through ``python -m module``.

    The decorated function itself is returned unchanged, with two attributes
    attached:

    * ``func.command(*args, **kwargs)`` validates the arguments against the
      signature of ``func`` and returns the argument vector
      ``["python", "-m", module, <json>]``, where ``<json>`` is the
      JSON-serialized ``[args, kwargs]`` pair.
    * ``func.main(argv)`` is the counterpart entry point: it decodes
      ``argv[1]`` as that JSON pair and calls ``func`` with it.

    Args:
        module (str): dotted name of the module passed to ``python -m``.

    Returns:
        callable: a decorator that attaches ``main`` and ``command`` to the
        function it receives and returns that same function.
    """

    def decorator(func):
        signature = inspect.signature(func)

        def main(argv):
            """Call ``func`` with the JSON-encoded arguments in ``argv[1]``."""
            args, kwargs = json.loads(argv[1])
            func(*args, **kwargs)

        @functools.wraps(func)
        def command(*args, **kwargs) -> list:
            """Return the argument vector that runs ``func`` in a subprocess.

            Raises:
                TypeError: if the arguments do not match the signature of
                    ``func``, or if they are not JSON-serializable.
            """
            # Fail early, in the calling process, when the arguments could
            # not be bound to func; bind() raises TypeError in that case.
            signature.bind(*args, **kwargs)

            serialized = json.dumps([args, kwargs])
            return ["python", "-m", module, serialized]

        func.main = main
        func.command = command
        return func

    return decorator


@call_via_subprocess("fv3config.fv3run._native_main")
def run_native(
    config_dict_or_location, outdir, runfile=None, capture_output: bool = True
):
    """Run the FV3GFS model with the given configuration.

    Copies the resulting directory to a target location. Will use the Google cloud
    storage key at ``$GOOGLE_APPLICATION_CREDENTIALS`` by default. Requires the
    fv3gfs-python package.

    Args:
        config_dict_or_location (dict or str): a configuration dictionary, or a
            location (local or on Google cloud storage) of a yaml file containing
            a configuration dictionary
        outdir (str): location to copy the resulting run directory
        runfile (str, optional): Python model script to use in place of the default.
        capture_output (bool, optional): If true, then the stderr and stdout
            streams will be redirected to the files `outdir/stderr.log` and `outdir/stdout.log`
            respectively.

    Raises:
        subprocess.CalledProcessError: if the ``mpirun`` command exits with a
            non-zero status.
    """
    _set_stacksize_unlimited()
    with _temporary_directory(outdir) as localdir:
        config_out_filename = os.path.join(localdir, CONFIG_OUT_FILENAME)
        # we need to write the dict to the run directory for archival and also load
        # the dict, it ends up being convenient to do both at once
        config_dict = _get_config_dict_and_write(
            config_dict_or_location, config_out_filename
        )
        write_run_directory(config_dict, localdir)
        if runfile is not None:
            # place the script in the run directory; the model is launched
            # from there using only the script's base name
            filesystem.get_file(
                runfile, os.path.join(localdir, os.path.basename(runfile))
            )
        with _output_stream_context(localdir, capture_output) as (stdout, stderr):
            n_processes = get_n_processes(config_dict)
            # hand over a copy so the module-level MPI_FLAGS can never be
            # modified by flag post-processing
            mpi_flags = _add_oversubscribe_if_necessary(list(MPI_FLAGS), n_processes)
            _run_experiment(
                localdir,
                n_processes,
                runfile=runfile,
                mpi_flags=mpi_flags,
                stdout=stdout,
                stderr=stderr,
            )


def _set_stacksize_unlimited():
    """Try to remove the stack size limit of the current process.

    This is best effort: when the limit cannot be changed a warning is issued
    and execution continues.
    """
    try:
        resource.setrlimit(
            resource.RLIMIT_STACK, (resource.RLIM_INFINITY, resource.RLIM_INFINITY)
        )
    except (ValueError, OSError):
        # setrlimit reports a rejected limit as ValueError and a failing
        # system call as OSError; neither should abort the run
        warnings.warn(
            "could not remove stacksize limit, may run out of memory as a result"
        )


def _add_oversubscribe_if_necessary(mpi_flags, n_processes):
    """Return MPI flags, with ``--oversubscribe`` added if CPUs are scarce.

    The flag is appended when the machine reports fewer CPUs than
    ``n_processes``. If the CPU count cannot be determined, a warning is
    issued and the flags are returned without it.

    Args:
        mpi_flags (list of str): base flags; this list is never modified.
        n_processes (int): number of MPI processes that will be launched.

    Returns:
        list of str: a new list holding the flags to use.
    """
    # work on a copy: callers pass shared lists (e.g. the module-level
    # MPI_FLAGS) that must not accumulate flags across calls
    flags = list(mpi_flags)
    try:
        cpu_count = multiprocessing.cpu_count()
    except NotImplementedError:
        warnings.warn(
            "could not determine cpu count, assuming number of processors "
            "is at least as many as number of MPI tasks"
        )
        return flags
    if cpu_count < n_processes:
        flags.append("--oversubscribe")
    return flags


@contextlib.contextmanager
def _temporary_directory(outdir):
    """Yield a local working directory whose contents end up in ``outdir``.

    For a local ``outdir`` the directory is created if needed and yielded
    directly. Otherwise a temporary directory is yielded and, when the
    context exits (normally or through an exception), its contents are
    uploaded to ``outdir``.
    """
    fs = filesystem.get_fs(outdir)

    if filesystem.is_local_path(outdir):
        fs.makedirs(outdir, exist_ok=True)
        yield outdir
        return

    with tempfile.TemporaryDirectory() as scratch_dir:
        try:
            yield scratch_dir
        finally:
            # runs on failure too, so partial output is still uploaded
            logger.info("Copying output to %s", outdir)
            fs.makedirs(outdir, exist_ok=True)
            filesystem.put_directory(scratch_dir, outdir)


def _captured_output_context(localdir):
    """Generator yielding binary log files for stdout and stderr.

    Both files are opened for writing inside ``localdir`` and closed once the
    generator finishes. A ``subprocess.CalledProcessError`` thrown into the
    generator is logged, with a pointer to the log files, and re-raised.
    """
    stdout_path = os.path.join(localdir, STDOUT_FILENAME)
    stderr_path = os.path.join(localdir, STDERR_FILENAME)
    with open(stdout_path, "wb") as stdout_log:
        with open(stderr_path, "wb") as stderr_log:
            try:
                yield stdout_log, stderr_log
            except subprocess.CalledProcessError as error:
                logger.critical(
                    "Experiment failed. Check %s and %s for logs.",
                    STDOUT_FILENAME,
                    STDERR_FILENAME,
                )
                raise error


def _uncaptured_output_context(localdir):
    """Generator yielding the current ``sys.stdout`` and ``sys.stderr``.

    ``localdir`` is accepted only to mirror ``_captured_output_context`` and
    is not used. A ``subprocess.CalledProcessError`` thrown into the
    generator is logged and re-raised.
    """
    passthrough_streams = (sys.stdout, sys.stderr)
    try:
        yield passthrough_streams
    except subprocess.CalledProcessError as error:
        logger.critical("Experiment failed")
        raise error


@contextlib.contextmanager
def _output_stream_context(localdir: str, capture_output: bool):
    """Context manager providing the ``(stdout, stderr)`` pair for a run.

    Delegates to ``_captured_output_context`` when ``capture_output`` is
    truthy and to ``_uncaptured_output_context`` otherwise.
    """
    logger.info("running experiment")
    stream_source = (
        _captured_output_context if capture_output else _uncaptured_output_context
    )
    yield from stream_source(localdir)


def _get_python_command(runfile):
    """Build the Python part of the model launch command.

    The script is chosen in this order: the base name of ``runfile`` when one
    is given, then the value of the environment variable named by
    ``RUNFILE_ENV_VAR``, and finally the ``fv3gfs.run`` module.

    Returns:
        list of str: ``python3 -m mpi4py`` followed by the chosen target.
    """
    launcher = ["python3", "-m", "mpi4py"]
    if runfile is not None:
        return launcher + [os.path.basename(runfile)]

    env_runfile = os.environ.get(RUNFILE_ENV_VAR)
    if env_runfile is not None:
        return launcher + [env_runfile]

    return launcher + ["-m", "fv3gfs.run"]


def _run_experiment(
    dirname, n_processes, runfile, mpi_flags=None, stdout=None, stderr=None
):
    """Launch the model under ``mpirun`` and wait for it to finish.

    Args:
        dirname (str): run directory, used as the working directory of the
            launched command.
        n_processes (int): number of MPI processes, passed as ``mpirun -n``.
        runfile (str or None): model script forwarded to
            ``_get_python_command``.
        mpi_flags (list of str, optional): extra flags placed between the
            ``mpirun -n <n>`` prefix and the Python command. Defaults to none.
        stdout: destination for the command's standard output, forwarded to
            ``subprocess.check_call``.
        stderr: destination for the command's standard error, forwarded to
            ``subprocess.check_call``.

    Raises:
        subprocess.CalledProcessError: if the command exits with a non-zero
            status.
    """
    if mpi_flags is None:
        mpi_flags = []

    python_command = _get_python_command(runfile)
    logger.info("Running experiment in %s", dirname)
    subprocess.check_call(
        ["mpirun", "-n", str(n_processes)] + mpi_flags + python_command,
        cwd=dirname,
        # each stream must go to its own destination so that the stdout and
        # stderr logs contain what their names say
        stdout=stdout,
        stderr=stderr,
    )


def _get_config_dict_and_write(config_dict_or_location, config_out_filename):
    """Return the configuration dict and store a copy at ``config_out_filename``.

    A dict argument is dumped to ``config_out_filename`` and returned as is.
    Anything else is treated as the location of a configuration file, which
    is copied to ``config_out_filename`` and then loaded.
    """
    if not isinstance(config_dict_or_location, dict):
        return _copy_and_load_config_dict(
            config_dict_or_location, config_out_filename
        )

    with filesystem.open(config_out_filename, "w") as config_file:
        dump(config_dict_or_location, config_file)
    return config_dict_or_location


def _copy_and_load_config_dict(config_location, local_target_location):
    """Copy a configuration file to a local path and return its loaded content.

    Args:
        config_location (str): location of the configuration file to fetch.
        local_target_location (str): local path the file is copied to.
    """
    filesystem.get_file(config_location, local_target_location)
    with open(local_target_location, "r") as config_file:
        return load(config_file)


if __name__ == "__main__":
    # Legacy entry point. run_native.command targets
    # fv3config.fv3run._native_main, so this branch is not expected to run;
    # if the warning ever shows up, run_native.command is the likely culprit.
    # The block can be deleted once it has gone untriggered for a while.
    warnings.warn(
        message=(
            "calling fv3config.fv3run._native is deprecated, "
            "call fv3config.fv3run._native_main instead"
        ),
        category=DeprecationWarning,
    )
    run_native.main(sys.argv)