from itertools import chain
import re
import sys
from snakemake.io import is_flagged
from snakemake.linting import Linter, Lint, links, NAME_PATTERN
[docs]
class RuleLinter(Linter):
def item_desc_plain(self, rule):
    """Return a one-line, human-readable description of ``rule``.

    The text contains the rule's name, its line number and the snakefile
    it is defined in.
    """
    description = f"rule {rule.name} (line {rule.lineno}, {rule.snakefile})"
    return description
def item_desc_json(self, rule):
    """Return a dict describing ``rule`` for JSON output.

    Keys are ``rule`` (the rule name), ``line`` (its line number) and
    ``snakefile`` (the file it is defined in).
    """
    description = {}
    description["rule"] = rule.name
    description["line"] = rule.lineno
    description["snakefile"] = rule.snakefile
    return description
def lint_params_prefix(self, rule):
    """Yield a lint for each string param that is a prefix of an input/output file.

    Only non-empty ``str`` params are considered, and they are compared
    against those entries of ``rule.input`` and ``rule.output`` that are
    themselves ``str`` instances.
    """
    for param, value in rule.params.items():
        # Non-string or empty params cannot be a hardcoded path prefix.
        if not isinstance(value, str) or not value:
            continue

        is_prefix = False
        for path in chain(rule.input, rule.output):
            if isinstance(path, str) and path.startswith(value):
                is_prefix = True
                break
        if not is_prefix:
            continue

        yield Lint(
            title="Param {} is a prefix of input or output file but hardcoded".format(
                param
            ),
            body="If this is meant to represent a file path prefix, it will fail when running "
            "workflow in environments without a shared filesystem. "
            "Instead, provide a function that infers the appropriate prefix from the input or "
            "output file, e.g.: lambda w, input: os.path.splitext(input[0])[0]",
            links=[links.params, links.input_functions],
        )
def lint_log_directive(self, rule):
    """Yield a lint when ``rule`` has no log directive.

    Nothing is reported for rules that define a log, or that are flagged
    ``norun`` or ``is_handover``.
    """
    if rule.log or rule.norun or rule.is_handover:
        return

    yield Lint(
        title="No log directive defined",
        body="Without a log directive, all output will be printed "
        "to the terminal. In distributed environments, this means "
        "that errors are harder to discover. In local environments, "
        "output of concurrent jobs will be mixed and become unreadable.",
        links=[links.log],
    )
def lint_not_used_params(
    self,
    rule,
    valid_names={
        "input",
        "output",
        "log",
        "params",
        "wildcards",
        "threads",
        "resources",
    },
    regex=re.compile(f"{{(?P<name>{NAME_PATTERN}).*?}}"),
):
    """Yield a lint for each ``{name...}`` placeholder in the shell command
    whose name is not one of ``valid_names``.

    A match is not reported when it has a character on both sides and
    either the preceding character is ``{`` or the following one is ``}``
    (presumably an escaped ``{{...}}`` sequence).
    """
    shellcmd = rule.shellcmd
    if not shellcmd:
        return

    cmd_len = len(shellcmd)
    for match in regex.finditer(shellcmd):
        name = match.group("name")
        if name in valid_names:
            continue

        prev_idx = match.start() - 1
        next_idx = match.end()
        # Neighbours are only inspected when both indices are inside the
        # command; a match touching either end is always reported.
        has_neighbours = prev_idx >= 0 and next_idx < cmd_len
        if has_neighbours and (
            shellcmd[prev_idx] == "{" or shellcmd[next_idx] == "}"
        ):
            continue

        yield Lint(
            title="Shell command directly uses variable {} from outside of the rule".format(
                name
            ),
            body="It is recommended to pass all files as input and output, and non-file parameters "
            "via the params directive. Otherwise, provenance tracking is less accurate.",
            links=[links.params],
        )
def lint_long_run(self, rule):
    """Yield a lint when the ``run`` directive of ``rule`` is long.

    Length is measured as the number of bytes in the compiled bytecode of
    ``rule.run_func`` (``__code__.co_code``). The limit is 70 bytes on
    Python versions before 3.11 and 210 bytes from 3.11 on (presumably
    because newer interpreters emit larger bytecode for the same source;
    confirm before tuning).

    Rules that do not use the run directive are skipped without touching
    ``rule.run_func``.
    """
    # Check is_run first: only run-directive rules are of interest, and
    # this avoids an AttributeError when run_func is None on other rules.
    if not rule.is_run:
        return

    max_len = 70 if sys.version_info < (3, 11) else 210
    func_code = rule.run_func.__code__.co_code
    if len(func_code) > max_len:
        yield Lint(
            title="Migrate long run directives into scripts or notebooks",
            body="Long run directives hamper workflow readability. Use the script or notebook directive instead. "
            "Note that the script or notebook directive does not involve boilerplate. Similar to run, you "
            "will have direct access to params, input, output, and wildcards. "
            "Only use the run directive for a handful of lines.",
            links=[links.external_scripts, links.notebooks],
        )
def lint_iofile_by_index(self, rule, regex=re.compile(r"(input|output)\[[0-9]+\]")):
    """Yield a lint when the shell command accesses input or output by index.

    The command is searched with ``regex``, which by default matches
    ``input[<digits>]`` or ``output[<digits>]``. Rules without a shell
    command are skipped.
    """
    shellcmd = rule.shellcmd
    if not shellcmd or regex.search(shellcmd) is None:
        return

    yield Lint(
        title="Do not access input and output files individually by index in shell commands",
        body="When individual access to input or output files is needed (i.e., just writing '{input}' "
        "is impossible), use names ('{input.somename}') instead of index based access.",
        links=[links.rules],
    )
def lint_missing_software_definition(self, rule):
    """Yield a lint when ``rule`` has neither a conda environment nor a container.

    Rules flagged ``norun``, ``is_handover`` or ``is_run`` are skipped.
    If the rule defines environment modules, the lint asks to complement
    them; otherwise it asks to specify a conda environment or container.
    """
    if rule.norun or rule.is_handover or rule.is_run:
        return
    if rule.conda_env or rule.container_img:
        return

    if rule.env_modules:
        title = "Additionally specify a conda environment or container for each rule, environment modules are not enough"
        body = (
            "While environment modules allow to document and deploy the required software on a certain "
            "platform, they lock your workflow in there, disabling easy reproducibility on other machines "
            "that don't have exactly the same environment modules. Hence env modules (which might be beneficial "
            "in certain cluster environments), should always be complemented with equivalent conda "
            "environments."
        )
    else:
        title = "Specify a conda environment or container for each rule."
        body = (
            "This way, the used software for each specific step is documented, and "
            "the workflow can be executed on any machine without prerequisites."
        )

    yield Lint(
        title=title,
        body=body,
        links=[links.package_management, links.containers],
    )