"""Main orchestrator for biallelic inactivation analysis pipeline.
This module contains the Aberrations class which coordinates the entire
analysis workflow: loading manifest configuration, invoking data drivers,
organizing reference data, and executing discovery analyses.
"""
import yaml
import os
from typing import Dict, List
from biallelic.misc import xopen, package_modules, get_module_method
from biallelic import drivers, discovery
class Aberrations:
    """Orchestrates biallelic inactivation discovery from YAML manifest.

    Loads a YAML manifest file that specifies:

    - Reference datasets (gene annotations, sample metadata)
    - Input files (genomic data in various formats)
    - Analyses to run (discovery algorithms to execute)

    The class dynamically loads appropriate drivers for each file format
    and invokes discovery analyses to identify biallelic inactivation
    patterns.

    Attributes:
        manifest_file: Path to the YAML manifest file
        manifest_content: Parsed YAML manifest as dictionary
        data_path: Directory containing manifest (relative path base)
        logger: SimpleLogger for logging operations
        drivers_map: Mapping of driver names to module paths
        discovery_map: Mapping of analysis names to module paths
        aberration_list: List of loaded aberration DataFrames
        reference_map: Reference datasets (gene models, sample metadata)
        title: Analysis title from manifest

    Example:
        >>> from biallelic.bi import Aberrations
        >>> from biallelic.logging import SimpleLogger
        >>> logger = SimpleLogger("analysis", "/path/to/logs")
        >>> aberrations = Aberrations("/path/to/manifest.yaml", logger)
        >>> aberrations.biallelic_inactivations("/path/to/output")
    """

    def __init__(self, manifest_file: str, logger) -> None:
        """Initialize Aberrations processor from YAML manifest.

        Loads the manifest file, discovers available drivers and analyses,
        loads reference datasets, and prepares input data.

        Args:
            manifest_file: Path to YAML manifest configuration file
            logger: SimpleLogger instance for logging operations
                (biallelic.logging.SimpleLogger)

        Raises:
            FileNotFoundError: If manifest file not found
            yaml.YAMLError: If manifest file is not valid YAML
            KeyError: If required fields missing from manifest
            ValueError: If sample_donors reference not defined (required)
            Exception: If driver or analysis loading fails
        """
        self.logger = logger
        self.manifest_file = manifest_file
        # Relative paths found in the manifest are resolved against the
        # directory that contains the manifest itself.
        self.data_path = os.path.dirname(self.manifest_file)

        self.logger.log.info("Parse MANIFEST from %s" % manifest_file)
        with xopen(self.manifest_file, "rb") as manifest:
            self.manifest_content = yaml.safe_load(manifest)

        # Index the available driver / discovery modules by their short
        # name (last component of the dotted module path).
        available_drivers = package_modules(drivers)
        available_discovery = package_modules(discovery)
        self.drivers_map = {d.split(".")[-1]: d for d in available_drivers}
        self.discovery_map = {d.split(".")[-1]: d for d in available_discovery}

        self.aberration_list = []
        self.reference_map = {}

        # References must be loaded first: input drivers receive
        # reference_map as an argument.
        self.load_refs()
        self.load_contents()
        self.title = self.manifest_content["title"]

    def load_refs(self) -> None:
        """Load reference datasets specified in manifest.

        Iterates through manifest "ref" section, invokes appropriate drivers
        for each reference type (genes, sample_donors, etc.), and stores
        results in reference_map. References whose driver is unknown, or
        whose driver does not implement the reference type, are logged as
        errors and skipped.

        Raises:
            KeyError: If the "ref" section, or a "path" / "format_driver"
                field of one of its entries, is missing
            ValueError: If sample_donors reference is not defined
        """
        for ref_item, ref_metadata in self.manifest_content["ref"].items():
            ref_path = ref_metadata["path"]
            if not os.path.isabs(ref_path):
                ref_path = os.path.abspath(
                    os.path.join(self.data_path, ref_path)
                )

            format_driver = ref_metadata["format_driver"]
            if format_driver not in self.drivers_map:
                self.logger.log.error(
                    "Driver %s not supported, please write one"
                    % format_driver
                )
                continue

            sub_logger = self.logger.add_log(
                "ref_%s_%s" % (ref_item, format_driver)
            )
            # The driver function is looked up by the reference name itself.
            driver_method = get_module_method(
                drivers, format_driver, ref_item
            )
            if driver_method is None:
                sub_logger.log.error(
                    "Driver %s doesn't implement %s"
                    % (format_driver, ref_item)
                )
                continue

            self.reference_map[ref_item] = driver_method(
                ref_path, sub_logger.log
            )

        if "sample_donors" not in self.reference_map:
            no_sample_donor_info_msg = (
                'The "sample_donors" annotation is missing. '
                "This means that some of the discovery analyses will "
                "not be able to match samples and patients. Please add a "
                '"sample_donors" in the metadata, in the "ref" section'
            )
            self.logger.log.error(no_sample_donor_info_msg)
            raise ValueError(no_sample_donor_info_msg)

    def load_contents(self) -> None:
        """Load input data files specified in manifest.

        Iterates through manifest "input" section, invokes appropriate
        drivers for each file format, and stores loaded aberration data in
        aberration_list. Inputs whose driver is unknown are logged as errors
        and skipped.

        An "extra_driver_args" entry that is absent or empty (parsed from
        YAML as None) is treated as "no extra arguments".

        Raises:
            KeyError: If the "input" section, or a "format_driver" / "type" /
                "path" field of one of its entries, is missing
        """
        for input_item in self.manifest_content["input"]:
            driver = input_item["format_driver"]
            if driver not in self.drivers_map:
                self.logger.log.error(
                    "Driver %s not supported, please write one" % driver
                )
                continue

            sub_logger = self.logger.add_log(
                "%s_%s" % (input_item["type"], driver)
            )

            # A key that is present but empty yields None, which cannot be
            # **-unpacked later on; normalise it to an empty dict.
            extra_args = input_item.get("extra_driver_args") or {}

            input_path = input_item["path"]
            if not os.path.isabs(input_path):
                input_path = os.path.join(self.data_path, input_path)

            self.load_aberration(
                driver,
                input_item["type"],
                input_path,
                extra_args,
                sub_logger,
            )

    def load_aberration(
        self, driver: str, input_type: str, input_path: str,
        extra_args: Dict, logger
    ) -> None:
        """Load aberrations from a single input file using appropriate driver.

        The driver function is called with the input path, the logger's
        ``log`` object, the reference map and the extra keyword arguments;
        its return value is appended to aberration_list. If the driver does
        not implement ``input_type`` an error is logged and nothing is
        appended.

        Args:
            driver: Name of the driver module to use
            input_type: Type of aberrations in file (snv, indel, scna, etc.)
            input_path: Path to input file
            extra_args: Additional arguments to pass to driver function
                (None is accepted and treated as no extra arguments)
            logger: SubLogger for this load operation
        """
        driver_method = get_module_method(drivers, driver, input_type)
        if driver_method is None:
            logger.log.error(
                "Driver %s doesn't implement %s" % (driver, input_type)
            )
            return

        self.aberration_list.append(
            driver_method(
                input_path,
                logger.log,
                self.reference_map,
                **(extra_args or {}),
            )
        )

    def biallelic_inactivations(self, output_path: str) -> None:
        """Execute discovery analyses to identify biallelic inactivations.

        Iterates through manifest "analyses" section, invokes each discovery
        analysis with loaded aberrations and reference data, writing results
        to output directory. Analyses that are unknown, or whose module has
        no ``main`` function, are logged as errors and skipped.

        Args:
            output_path: Directory where analysis output files will be
                written

        Raises:
            KeyError: If the "analyses" section, or the "name" field of one
                of its entries, is missing
        """
        for analysis in self.manifest_content["analyses"]:
            name = analysis["name"]
            if name not in self.discovery_map:
                self.logger.log.error(
                    "Analysis %s not supported, please write it" % name
                )
                continue

            # Every discovery module is expected to expose a "main" entry
            # point.
            analysis_method = get_module_method(discovery, name, "main")
            sub_logger = self.logger.add_log("discovery_%s" % name)
            if analysis_method is None:
                sub_logger.log.error(
                    "Discovery analysis %s doesn't implement a method named 'main'"
                    % name
                )
                continue

            analysis_method(
                self.aberration_list,
                output_path,
                self.reference_map,
                self.title,
                sub_logger.log,
            )