# Source code for snakemake.ioutils.lookup

from abc import ABC, abstractmethod
from collections.abc import Mapping
from functools import partial
import re
from typing import List, Optional, Union

import snakemake.io
import snakemake.utils
from snakemake.exceptions import LookupError



class WildcardHandlerBase(ABC):
    """Evaluate a lookup expression now, or defer it until wildcards are known.

    ``func`` is the callable that performs the actual lookup. All further
    keyword arguments form an auxiliary namespace: its entries may be referred
    to from within expressions and take precedence over wildcard values of the
    same name. Namespace values may themselves be callables taking the
    wildcards as their only argument.
    """

    fmt_regex = re.compile(r"\{(?P<stmt>[^\{][^\{\}]+)\}[^\}]")

    def __init__(self, func, **namespace):
        self.func = func
        self.namespace = namespace

    def needs_wildcards(self, expression):
        """Return True if ``expression`` cannot be resolved without wildcards.

        This is the case if the expression is a callable, or if at least one of
        the names reported by ``snakemake.io.get_wildcard_names`` for it is not
        provided by the namespace.
        """
        if callable(expression):
            return True
        return any(
            name not in self.namespace
            for name in snakemake.io.get_wildcard_names(expression)
        )

    @abstractmethod
    def apply_func(self, expression, namespace=None):
        """Run the lookup on an already formatted expression.

        ``namespace`` is the namespace that was used for formatting the
        expression. To be implemented by subclasses.
        """
        ...

    def handle(self, expression):
        """Resolve ``expression`` immediately or return a deferred function.

        If neither the expression nor any namespace value depends on
        wildcards, the lookup result is returned directly. Otherwise, a
        function taking ``wildcards`` is returned which performs the lookup
        once it is called.
        """
        if not self.needs_wildcards(expression) and not any(
            callable(value) for value in self.namespace.values()
        ):
            # everything is known already, hence resolve right away
            if self.namespace:
                resolved_expression = snakemake.utils.format(
                    expression, **self.namespace
                )
            else:
                resolved_expression = expression
            return self.apply_func(resolved_expression, self.namespace)

        def inner(wildcards):
            if self.namespace:
                # work on a copy, the handler itself has to stay reusable
                namespace = dict(self.namespace)
                for name, value in self.namespace.items():
                    # resolve callables given in namespace
                    if callable(value):
                        namespace[name] = value(wildcards)
                # add wildcard values to namespace, but do not override
                # namespace entries (they have been chosen explicitly by the dev)
                for name, value in wildcards.items():
                    namespace.setdefault(name, value)
            else:
                namespace = wildcards

            if callable(expression):
                resolved_expression = expression(wildcards)
            else:
                resolved_expression = expression
            resolved_expression = snakemake.utils.format(
                resolved_expression, **namespace
            )
            return self.apply_func(resolved_expression, namespace)

        return inner





class DpathWildcardHandler(WildcardHandlerBase):
    """Handler for dpath expressions: the lookup only needs the expression."""

    def apply_func(self, expression, namespace=None):
        """Call ``self.func`` with the resolved dpath expression.

        ``namespace`` is accepted for interface compatibility and not used.
        """
        return self.func(expression)





class QueryWildcardHandler(WildcardHandlerBase):
    """Handler for query expressions with optional column selection.

    In addition to the query itself, ``cols`` (a single column name or a list
    of column names) may refer to wildcards or namespace entries. ``cols``
    and ``is_nrows`` are forwarded to ``func`` as keyword arguments.
    """

    def __init__(self, func, cols=None, is_nrows=None, **namespace):
        super().__init__(func, **namespace)
        self.cols = cols
        self.is_nrows = is_nrows

    def needs_wildcards(self, expression):
        """Return True if the expression or any entry of ``cols`` needs wildcards."""
        if super().needs_wildcards(expression):
            return True
        if self.cols is None:
            return False
        if isinstance(self.cols, list):
            # Zero-argument super() is not available inside the generator
            # expression's own scope, hence the explicit two-argument form.
            return any(
                super(QueryWildcardHandler, self).needs_wildcards(col)
                for col in self.cols
            )
        return super().needs_wildcards(self.cols)

    def apply_func(self, expression, namespace=None):
        """Format ``cols`` with ``namespace`` (if both are given) and run the query.

        Returns whatever ``self.func`` returns for the resolved expression,
        the resolved columns and ``is_nrows``.
        """
        cols = self.cols
        if self.cols is not None and namespace is not None:
            if isinstance(self.cols, list):
                cols = [snakemake.utils.format(col, **namespace) for col in self.cols]
            else:
                cols = snakemake.utils.format(self.cols, **namespace)
        return self.func(expression, cols=cols, is_nrows=self.is_nrows)




# Sentinel marking "no default given" for the ``default`` argument of lookup().
# It is compared by identity, so that ``None`` stays usable as an explicit
# fallback value.
NODEFAULT = object()



def lookup(
    dpath: Optional[str] = None,
    query: Optional[str] = None,
    cols: Optional[Union[List[str], str]] = None,
    is_nrows: Optional[int] = None,
    within=None,
    default=NODEFAULT,
    **namespace,
):
    """Lookup values in a pandas dataframe, series, or python mapping (e.g. dict).

    Required argument ``within`` should be a pandas dataframe or series (in which
    case use ``query``, and optionally ``cols`` and ``is_nrows``), or a Python
    mapping like a dict (in which case the ``dpath`` argument is used).

    In case of a pandas dataframe (see https://pandas.pydata.org),
    the query parameter is passed to DataFrame.query().
    If the query results in multiple rows, the result is returned as a list of
    named tuples with the column names as attributes.
    If the query results in a single row, the result is returned as a single
    named tuple with the column names as attributes.
    In both cases, the result can be used by the expand or collect function,
    e.g. ``collect("results/{item.sample}.txt", item=lookup(query="someval > 2", within=samples))``.
    Since the result, in any case, also evaluates to True if it is not empty
    when interpreted as a boolean by Python, it can also be used as a condition
    for the branch function, e.g.
    ``branch(lookup(query="sample == '{sample}' & someval > 2", within=samples), then="foo", otherwise="bar")``.
    In case your dataframe has an index, you can also access the index within the
    query, e.g. for faster, constant time lookups: ``lookup(query="index.loc[{sample}]", within=samples)``.
    Further, it is possible to constrain the output to a list of columns, e.g.
    ``lookup(query="sample == '{sample}'", within=samples, cols=["somecolumn"])`` or to
    a single column, e.g.
    ``lookup(query="sample == '{sample}'", within=samples, cols="somecolumn")``.
    In the latter case, just a list of items in that column is returned.
    Finally, if the integer argument ``is_nrows`` is used, this returns true
    if there are that many rows in the query results, false otherwise.

    In case of a pandas series, the series is converted into a dataframe via
    Series.to_frame() and the same logic as for a dataframe is applied.

    In case of a python mapping, the ``dpath`` parameter is first passed to
    ``dpath.get()`` (see https://github.com/dpath-maintainers/dpath-python).
    If that raises a ``ValueError`` (which dpath uses to report that the dpath
    does not identify exactly one value), the result of ``dpath.values()`` is
    returned instead. The ``cols`` and ``is_nrows`` arguments are ignored. If the
    dpath is not found, a ``LookupError`` is raised, unless a default fallback
    value is provided via the ``default`` argument.

    Query, dpath and cols may contain wildcards (e.g. {sample}).
    In that case, this function returns a Snakemake input function which takes
    wildcards as its only argument and will be evaluated by Snakemake
    once the wildcard values are known.

    In addition to wildcard values, dpath, query and cols may refer via the same syntax
    to auxiliary namespace arguments given to the lookup function, e.g.
    ``lookup(query="cell_type == '{sample.cell_type}'", within=samples, sample=lookup(query="sample == '{sample}'", within=samples))``
    This way, one can e.g. pass additional variables or chain lookups into more complex queries.

    A ``LookupError`` (from ``snakemake.exceptions``) is raised if ``within`` is
    missing, if ``cols`` or ``is_nrows`` have the wrong type, if ``query`` is
    combined with a mapping or ``dpath`` with a non-mapping, if neither ``query``
    nor ``dpath`` is given, or if the underlying query or dpath lookup fails.
    """
    error = partial(LookupError, query=query, dpath=dpath)

    if within is None:
        raise error(
            msg="Must provide a dataframe, series, or mapping to search within."
        )
    if cols is not None and not isinstance(cols, (str, list)):
        raise error(msg="The cols argument has to be either a str or a list of str.")
    if is_nrows is not None and not isinstance(is_nrows, int):
        raise error(msg="The is_nrows argument has to be an int.")

    if query is not None:
        if isinstance(within, Mapping):
            raise error(
                msg="Query parameter can only be used with pandas DataFrame or Series objects."
            )

        # imported lazily, pandas is only needed for query based lookups
        import pandas as pd

        if isinstance(within, pd.Series):
            within = within.to_frame()

        def do_query(query, cols=None, is_nrows=None):
            try:
                res = within.query(query)
            except Exception as e:
                # keep the original failure as explicit cause
                raise LookupError(query=query, exc=e) from e

            if is_nrows is not None:
                return is_nrows == len(res)
            if cols is not None:
                res = res[cols]
                if not isinstance(cols, list):
                    # single column select, just return a list of values
                    return res.to_list()
            # only expose the index if no explicit column selection was made
            res = list(res.itertuples(index=cols is None))
            if len(res) == 1:
                # just return the item if it is only one
                return res[0]
            return res

        return QueryWildcardHandler(
            do_query, cols=cols, is_nrows=is_nrows, **namespace
        ).handle(query)

    elif dpath is not None:
        if not isinstance(within, Mapping):
            raise error(
                msg="Dpath parameter can only be used with python mapping (e.g. dict)."
            )
        # imported lazily, dpath is only needed for mapping based lookups
        import dpath as dp

        def do_dpath(dpath):
            try:
                return dp.get(within, dpath)
            except ValueError:
                # no unique value for this dpath, return all matching values
                return dp.values(within, dpath)
            except KeyError as e:
                if default is not NODEFAULT:
                    return default
                raise LookupError(dpath=dpath, msg="Dpath not found.") from e

        return DpathWildcardHandler(do_dpath, **namespace).handle(dpath)
    else:
        raise error(msg="Must provide either a query or dpath parameter.")