# Source code for autolens.point.dataset

"""
Data structures for point-source strong lens observations.

Point-source lensing arises when the background source is compact enough to be treated
as a point (e.g. a quasar, supernova, or compact radio source).  Gravitational lensing
splits the source into multiple images whose positions, fluxes, and time delays constrain
the lens mass distribution.

``PointDataset`` holds the image-plane positions, fluxes, and time delays of a single
named point source together with their noise maps.  The ``name`` attribute is used to
pair this dataset with the corresponding ``Point`` model component during fitting.
When multiple point sources are fitted simultaneously (for example many multiply-imaged
background sources in a strong-lens cluster) they are collected in a plain Python
``list`` of ``PointDataset`` objects.

Two I/O surfaces are supported:

- JSON (via :func:`autoconf.output_to_json` / :func:`autoconf.from_json`) — exact
  round-trip, one file per ``PointDataset``; the canonical modeling input.
- CSV (via :meth:`PointDataset.to_csv` / :meth:`PointDataset.from_csv` and the
  module-level :func:`output_to_csv` / :func:`list_from_csv`) — one row per observed
  image, grouped by ``name``, so that tens or hundreds of cluster-scale point sources
  can be edited in a single spreadsheet.
"""
from autoconf import csvable
from typing import List, Tuple, Optional, Union

import autoarray as aa


_BASE_HEADERS = ["name", "y", "x", "positions_noise"]
_FLUX_HEADERS = ["flux", "flux_noise"]
_TIME_DELAY_HEADERS = ["time_delay", "time_delay_noise"]
_REDSHIFT_HEADERS = ["redshift"]



# [docs] -- Sphinx "viewcode" link marker left over from HTML extraction; not part of the module.
class PointDataset:
    def __init__(
        self,
        name: str,
        positions: Union[aa.Grid2DIrregular, List[List], List[Tuple]],
        positions_noise_map: Union[float, aa.ArrayIrregular, List[float]],
        fluxes: Optional[Union[aa.ArrayIrregular, List[float]]] = None,
        fluxes_noise_map: Optional[Union[float, aa.ArrayIrregular, List[float]]] = None,
        time_delays: Optional[Union[aa.ArrayIrregular, List[float]]] = None,
        time_delays_noise_map: Optional[
            Union[float, aa.ArrayIrregular, List[float]]
        ] = None,
        redshift: Optional[float] = None,
    ):
        """
        A collection of the data components that can be used for point-source model-fitting, for example fitting
        the observed positions of a strongly lensed quasar or supernova, or in strong lens cluster modeling, where
        there may be many tens or hundreds of individual source galaxies each of which is modeled as a point source.

        The name of the dataset is required for point-source model-fitting, as it pairs a point-source dataset with
        its corresponding point-source in the model-fit. For example, if a dataset has the name `source_1`, it will
        be paired with the `Point` model-component which has the name `source_1`. If a dataset component is not
        successfully paired with a model-component, an error is raised.

        Positions are stored as an `aa.Grid2DIrregular` and every other per-image quantity as an
        `aa.ArrayIrregular`; inputs already of those types are stored as passed. A scalar noise value (for the
        positions, fluxes or time delays) is expanded to one value per position.

        Parameters
        ----------
        name
            The name of the point source dataset which is paired to a `Point` in the `Model`.
        positions
            The image-plane (y,x) positions of the point-source.
        positions_noise_map
            The noise-value of every (y,x) position, which is typically the pixel-scale of the data.
        fluxes
            The image-plane flux of each observed point-source of light.
        fluxes_noise_map
            The noise-value of every observed flux, which is typically measured from the pixel values of the pixel
            containing the point source after convolution with the PSF.
        time_delays
            The time delays of each observed point-source of light in days.
        time_delays_noise_map
            The noise-value of every observed time delay, which is typically measured from the time delay analysis.
        redshift
            The redshift of the source. Optional; when provided it is carried through CSV round-trips alongside
            the positions so cluster-scale workflows can encode per-source redshifts in a single spreadsheet.
        """

        self.name = name

        # Ensure positions is a Grid2DIrregular
        self.positions = (
            positions
            if isinstance(positions, aa.Grid2DIrregular)
            else aa.Grid2DIrregular(values=positions)
        )

        total_images = len(self.positions)

        def broadcast_scalar(values):
            """
            Expand a scalar noise value into a list with one entry per position; return anything else unchanged.
            """
            if isinstance(values, (int, float)):
                return [values] * total_images
            return values

        def convert_to_array_irregular(values):
            """
            Convert data to ArrayIrregular if it is not already; `None` is passed through.
            """
            return (
                aa.ArrayIrregular(values=values)
                if values is not None and not isinstance(values, aa.ArrayIrregular)
                else values
            )

        # Ensure positions_noise_map is an ArrayIrregular (it is a required input, so it is always wrapped)
        positions_noise_map = broadcast_scalar(positions_noise_map)

        self.positions_noise_map = (
            positions_noise_map
            if isinstance(positions_noise_map, aa.ArrayIrregular)
            else aa.ArrayIrregular(values=positions_noise_map)
        )

        # Optional quantities stay `None` when omitted. Their noise maps accept a scalar, like the positions
        # noise map, so it is broadcast before conversion.
        self.fluxes = convert_to_array_irregular(fluxes)
        self.fluxes_noise_map = convert_to_array_irregular(
            broadcast_scalar(fluxes_noise_map)
        )
        self.time_delays = convert_to_array_irregular(time_delays)
        self.time_delays_noise_map = convert_to_array_irregular(
            broadcast_scalar(time_delays_noise_map)
        )

        self.redshift = float(redshift) if redshift is not None else None

    @property
    def info(self) -> str:
        """
        A multi-line, human-readable string summary of this instance.

        Each attribute is written on its own line as `<attribute> : <value>`.
        """
        info = f"name : {self.name}\n"
        info += f"positions : {self.positions}\n"
        info += f"positions_noise_map : {self.positions_noise_map}\n"
        info += f"fluxes : {self.fluxes}\n"
        info += f"fluxes_noise_map : {self.fluxes_noise_map}\n"
        info += f"time_delays : {self.time_delays}\n"
        info += f"time_delays_noise_map : {self.time_delays_noise_map}\n"
        info += f"redshift : {self.redshift}\n"
        return info

    def extent_from(self, buffer: float = 0.1):
        """
        Return the bounding extent of the positions, padded by `buffer` on every side.

        Parameters
        ----------
        buffer
            The amount subtracted from each minimum and added to each maximum coordinate.

        Returns
        -------
        The list `[y_min, y_max, x_min, x_max]`.
        """
        y_values = self.positions[:, 0]
        x_values = self.positions[:, 1]

        y_max = max(y_values) + buffer
        y_min = min(y_values) - buffer
        x_max = max(x_values) + buffer
        x_min = min(x_values) - buffer

        return [y_min, y_max, x_min, x_max]

    def to_csv(self, file_path: str):
        """
        Write this dataset to ``file_path`` as a CSV with one row per image.

        Optional flux / time-delay columns are included only when this dataset carries
        the corresponding values.  For multi-dataset output use :func:`output_to_csv`.
        """
        output_to_csv([self], file_path)

    @classmethod
    def from_csv(
        cls, file_path: str, name: Optional[str] = None
    ) -> "PointDataset":
        """
        Load a single ``PointDataset`` from a CSV written by :meth:`to_csv` or
        :func:`output_to_csv`.

        Parameters
        ----------
        file_path
            Path to a CSV file with at minimum the columns
            ``name, y, x, positions_noise``.
        name
            The ``name`` group to load.  Must be provided when the CSV contains more
            than one ``name``; when the CSV contains exactly one group it is picked
            automatically.

        Raises
        ------
        ValueError
            If the CSV contains no rows, if ``name`` is omitted and the CSV holds more
            than one group, or if no group is called ``name``.
        """
        datasets = list_from_csv(file_path)

        if not datasets:
            raise ValueError(
                f"CSV file {file_path!r} contained no PointDataset rows."
            )

        if name is None:
            if len(datasets) > 1:
                available = [d.name for d in datasets]
                raise ValueError(
                    f"CSV file {file_path!r} contains {len(datasets)} groups "
                    f"({available!r}); pass name= to select one."
                )
            return datasets[0]

        for dataset in datasets:
            if dataset.name == name:
                return dataset

        available = [d.name for d in datasets]
        raise ValueError(
            f"CSV file {file_path!r} has no group named {name!r}. "
            f"Available groups: {available!r}."
        )




def _optional_values(dataset: PointDataset, attr: str) -> Optional[List[float]]:
    """
    Read attribute ``attr`` of ``dataset`` and return its entries as a list of plain
    floats, or ``None`` when the attribute itself is ``None``.
    """
    stored = getattr(dataset, attr)
    return None if stored is None else list(map(float, stored))


def output_to_csv(datasets: List[PointDataset], file_path: str):
    """
    Write several ``PointDataset`` objects into one CSV file, one row per observed image.

    Every row carries the base columns ``name, y, x, positions_noise``.  The
    ``flux``/``flux_noise`` pair is added when at least one dataset has ``fluxes``, the
    ``time_delay``/``time_delay_noise`` pair when at least one dataset has
    ``time_delays``, and ``redshift`` when at least one dataset has a redshift.  A
    dataset lacking a value for an included column writes an empty string in that cell.

    The ``redshift`` cell is the dataset-level value repeated on each of that dataset's
    rows, because the redshift belongs to the source rather than to an individual image.

    This is the hand-editable / spreadsheet form; for exact round-trip serialisation
    use ``output_to_json`` / ``from_json``.

    Parameters
    ----------
    datasets
        The datasets to write, in output order.
    file_path
        The path handed to ``csvable.output_to_csv``.
    """
    with_flux = any(entry.fluxes is not None for entry in datasets)
    with_time_delay = any(entry.time_delays is not None for entry in datasets)
    with_redshift = any(entry.redshift is not None for entry in datasets)

    headers = list(_BASE_HEADERS)
    for enabled, extra_headers in (
        (with_flux, _FLUX_HEADERS),
        (with_time_delay, _TIME_DELAY_HEADERS),
        (with_redshift, _REDSHIFT_HEADERS),
    ):
        if enabled:
            headers += extra_headers

    def cell(values, index):
        """Return the value at ``index``, or a blank cell when the column is absent."""
        if values is None:
            return ""
        return values[index]

    rows = []
    for entry in datasets:
        grid = entry.positions
        noise = _optional_values(entry, "positions_noise_map")
        flux = _optional_values(entry, "fluxes")
        flux_noise = _optional_values(entry, "fluxes_noise_map")
        delay = _optional_values(entry, "time_delays")
        delay_noise = _optional_values(entry, "time_delays_noise_map")

        for index in range(len(grid)):
            record = {
                "name": entry.name,
                "y": float(grid[index][0]),
                "x": float(grid[index][1]),
                "positions_noise": noise[index],
            }

            if with_flux:
                record["flux"] = cell(flux, index)
                record["flux_noise"] = cell(flux_noise, index)

            if with_time_delay:
                record["time_delay"] = cell(delay, index)
                record["time_delay_noise"] = cell(delay_noise, index)

            if with_redshift:
                if entry.redshift is None:
                    record["redshift"] = ""
                else:
                    record["redshift"] = entry.redshift

            rows.append(record)

    csvable.output_to_csv(rows, file_path, headers=headers)


def _float_column(
    group_rows: List[dict], column: str, group_name: str
) -> Optional[List[float]]:
    raw = [row.get(column, "") for row in group_rows]
    populated = [v for v in raw if v not in ("", None)]

    if not populated:
        return None

    if len(populated) != len(raw):
        raise ValueError(
            f"CSV group {group_name!r} has partially populated column "
            f"{column!r}; every row in the group must have a value or all be blank."
        )

    return [float(v) for v in raw]


def _group_redshift(
    group_rows: List[dict], group_name: str
) -> Optional[float]:
    raw = [row.get("redshift", "") for row in group_rows]
    populated = [v for v in raw if v not in ("", None)]

    if not populated:
        return None

    if len(populated) != len(raw):
        raise ValueError(
            f"CSV group {group_name!r} has partially populated column "
            f"'redshift'; every row in the group must have a value or all be blank."
        )

    values = [float(v) for v in populated]
    if any(v != values[0] for v in values):
        raise ValueError(
            f"CSV group {group_name!r} has inconsistent 'redshift' values "
            f"{values!r}; a source redshift must be identical across all of its "
            f"image rows."
        )

    return values[0]


def list_from_csv(file_path: str) -> List[PointDataset]:
    """
    Build ``PointDataset`` objects from the rows of a CSV such as one written by
    :func:`output_to_csv` (or :meth:`PointDataset.to_csv`).

    Rows sharing a ``name`` form one ``PointDataset``; datasets are returned in order
    of first appearance of their name.  The column set is taken from the first row.

    The per-image optional columns (``flux``, ``flux_noise``, ``time_delay``,
    ``time_delay_noise``) are resolved independently for each group: fully populated
    columns are loaded as floats, fully blank (or absent) columns become ``None``, and
    a partially populated column raises ``ValueError``.

    The optional ``redshift`` column is per-source: within a group it must be either
    blank on every row or hold the same value on every row, otherwise ``ValueError``
    is raised.

    Returns
    -------
    The loaded datasets, or an empty list when the CSV yields no rows.

    Raises
    ------
    ValueError
        If any of the required columns ``name, y, x, positions_noise`` is missing.
    """
    records = csvable.list_from_csv(file_path)

    if not records:
        return []

    available = list(records[0].keys())

    # Report the first missing required column, in the order of _BASE_HEADERS.
    absent = [column for column in _BASE_HEADERS if column not in available]
    if absent:
        raise ValueError(
            f"CSV file {file_path!r} is missing required column {absent[0]!r}; "
            f"expected headers starting with {_BASE_HEADERS!r}."
        )

    # Plain dicts keep insertion order, which preserves first-appearance ordering.
    rows_by_name: "dict[str, List[dict]]" = {}
    for record in records:
        key = record["name"]
        if key not in rows_by_name:
            rows_by_name[key] = []
        rows_by_name[key].append(record)

    def per_image(source_rows, column, source_name):
        """Load an optional per-image column, or None when the CSV has no such column."""
        if column not in available:
            return None
        return _float_column(source_rows, column, source_name)

    loaded: List[PointDataset] = []
    for source_name, source_rows in rows_by_name.items():
        coordinates = [(float(r["y"]), float(r["x"])) for r in source_rows]
        coordinate_noise = [float(r["positions_noise"]) for r in source_rows]

        flux = per_image(source_rows, "flux", source_name)
        flux_noise = per_image(source_rows, "flux_noise", source_name)
        delay = per_image(source_rows, "time_delay", source_name)
        delay_noise = per_image(source_rows, "time_delay_noise", source_name)

        if "redshift" in available:
            source_redshift = _group_redshift(source_rows, source_name)
        else:
            source_redshift = None

        loaded.append(
            PointDataset(
                name=source_name,
                positions=coordinates,
                positions_noise_map=coordinate_noise,
                fluxes=flux,
                fluxes_noise_map=flux_noise,
                time_delays=delay,
                time_delays_noise_map=delay_noise,
                redshift=source_redshift,
            )
        )

    return loaded