# Source code for biallelic.bi

"""Main orchestrator for biallelic inactivation analysis pipeline.

This module contains the Aberrations class which coordinates the entire
analysis workflow: loading manifest configuration, invoking data drivers,
organizing reference data, and executing discovery analyses.
"""

import yaml
import os
from typing import Dict, List
from biallelic.misc import xopen, package_modules, get_module_method
from biallelic import drivers, discovery


class Aberrations:
    """Orchestrates biallelic inactivation discovery from a YAML manifest.

    The manifest specifies:

    - Reference datasets (``ref`` section), e.g. gene annotations and the
      mandatory ``sample_donors`` annotation
    - Input files (``input`` section), each with a ``format_driver``, a
      ``type`` and a ``path``
    - Analyses to run (``analyses`` section), each identified by ``name``

    Drivers and discovery analyses are looked up dynamically by name in the
    ``biallelic.drivers`` and ``biallelic.discovery`` packages.

    Attributes:
        manifest_file: Path to the YAML manifest file
        manifest_content: Parsed YAML manifest as dictionary
        data_path: Directory containing the manifest (relative path base)
        logger: SimpleLogger for logging operations
        drivers_map: Mapping of driver names to module paths
        discovery_map: Mapping of analysis names to module paths
        aberration_list: Aberration datasets returned by the input drivers
        reference_map: Reference datasets keyed by their manifest name
        title: Analysis title from manifest

    Example:
        >>> from biallelic.bi import Aberrations
        >>> from biallelic.logging import SimpleLogger
        >>> logger = SimpleLogger("analysis", "/path/to/logs")
        >>> aberrations = Aberrations("/path/to/manifest.yaml", logger)
        >>> aberrations.biallelic_inactivations("/path/to/output")
    """

    def __init__(self, manifest_file: str, logger) -> None:
        """Initialize the processor from a YAML manifest.

        Parses the manifest, discovers the available drivers and analyses,
        then loads the reference datasets followed by the input data.

        Args:
            manifest_file: Path to YAML manifest configuration file
            logger: Logger object exposing ``log`` and ``add_log(name)``
                (biallelic.logging.SimpleLogger)

        Raises:
            yaml.YAMLError: If the manifest is not valid YAML
            KeyError: If a required manifest field ("ref", "input",
                "title", ...) is missing
            ValueError: If the manifest is empty / not a mapping, or if the
                "sample_donors" reference could not be loaded

        Errors raised while opening the manifest or by a driver propagate
        unchanged.
        """
        self.logger = logger
        self.manifest_file = manifest_file
        # Relative paths in the manifest are resolved against this directory.
        self.data_path = os.path.dirname(self.manifest_file)

        self.logger.log.info("Parse MANIFEST from %s" % manifest_file)
        with xopen(self.manifest_file, "rb") as manifest:
            self.manifest_content = yaml.safe_load(manifest)

        # yaml.safe_load returns None for an empty document; fail with a
        # clear message instead of an opaque TypeError on first subscript.
        if not isinstance(self.manifest_content, dict):
            invalid_manifest_msg = (
                "Manifest %s is empty or is not a YAML mapping" % manifest_file
            )
            self.logger.log.error(invalid_manifest_msg)
            raise ValueError(invalid_manifest_msg)

        # Index modules by their short name (last dotted component), which
        # is the name used in the manifest.
        available_drivers = package_modules(drivers)
        available_discovery = package_modules(discovery)
        self.drivers_map = {d.split(".")[-1]: d for d in available_drivers}
        self.discovery_map = {d.split(".")[-1]: d for d in available_discovery}

        self.aberration_list = []
        self.reference_map = {}
        # References first: input drivers receive reference_map.
        self.load_refs()
        self.load_contents()
        self.title = self.manifest_content["title"]

    def _resolve_path(self, path: str) -> str:
        """Return ``path`` unchanged if absolute, else resolved against data_path."""
        if os.path.isabs(path):
            return path
        return os.path.abspath(os.path.join(self.data_path, path))

    def load_refs(self) -> None:
        """Load the reference datasets listed in the manifest "ref" section.

        For each entry, the driver named by ``format_driver`` is asked for a
        method named after the reference (e.g. ``sample_donors``); its result
        is stored in ``reference_map`` under that name. Unsupported drivers
        and missing methods are logged as errors and skipped.

        Raises:
            KeyError: If an entry lacks "path" or "format_driver"
            ValueError: If no "sample_donors" reference was loaded
        """
        for ref_item, ref_metadata in self.manifest_content["ref"].items():
            ref_path = self._resolve_path(ref_metadata["path"])
            driver = ref_metadata["format_driver"]

            if driver not in self.drivers_map:
                self.logger.log.error(
                    "Driver %s not supported, please write one" % driver
                )
                continue

            sub_logger = self.logger.add_log("ref_%s_%s" % (ref_item, driver))
            driver_method = get_module_method(drivers, driver, ref_item)
            if driver_method is None:
                sub_logger.log.error(
                    "Driver %s doesn't implement %s" % (driver, ref_item)
                )
                continue

            self.reference_map[ref_item] = driver_method(ref_path, sub_logger.log)

        if "sample_donors" not in self.reference_map:
            no_sample_donor_info_msg = (
                'The "sample_donors" annotation is missing. '
                "This means that some of the discovery analyses will "
                "not be able to match samples and patients. Please add a "
                '"sample_donors" in the metadata, in the "ref" section'
            )
            self.logger.log.error(no_sample_donor_info_msg)
            raise ValueError(no_sample_donor_info_msg)

    def load_contents(self) -> None:
        """Load the input files listed in the manifest "input" section.

        Each entry is dispatched to ``load_aberration`` with its driver,
        type, resolved path and optional ``extra_driver_args``. Entries whose
        driver is unknown are logged as errors and skipped.

        Relative paths are resolved against ``data_path`` the same way as
        reference paths. A missing or empty (YAML null) ``extra_driver_args``
        is treated as no extra arguments.

        Raises:
            KeyError: If an entry lacks "format_driver", "type" or "path"
        """
        for input_item in self.manifest_content["input"]:
            driver = input_item["format_driver"]
            if driver not in self.drivers_map:
                self.logger.log.error(
                    "Driver %s not supported, please write one" % driver
                )
                continue

            sub_logger = self.logger.add_log(
                "%s_%s" % (input_item["type"], driver)
            )
            # "extra_driver_args:" with no value parses to None, which would
            # break the ** expansion downstream; normalise it to {}.
            extra_args = input_item.get("extra_driver_args") or {}
            input_path = self._resolve_path(input_item["path"])
            self.load_aberration(
                driver,
                input_item["type"],
                input_path,
                extra_args,
                sub_logger,
            )

    def load_aberration(
        self, driver: str, input_type: str, input_path: str, extra_args: Dict, logger
    ) -> None:
        """Load aberrations from one input file and append them to aberration_list.

        Args:
            driver: Name of the driver module to use
            input_type: Type of aberrations in file (snv, indel, scna, etc.);
                also the name of the driver method that is looked up
            input_path: Path to input file
            extra_args: Additional keyword arguments passed to the driver method
            logger: Sub-logger for this load operation (its ``log`` attribute
                is handed to the driver)
        """
        driver_method = get_module_method(drivers, driver, input_type)
        if driver_method is None:
            logger.log.error(
                "Driver %s doesn't implement %s" % (driver, input_type)
            )
            return

        loaded = driver_method(
            input_path, logger.log, self.reference_map, **extra_args
        )
        self.aberration_list.append(loaded)

    def biallelic_inactivations(self, output_path: str) -> None:
        """Run the discovery analyses listed in the manifest "analyses" section.

        Each supported analysis module's ``main`` is called with the loaded
        aberrations, the output path, the reference map, the title and a
        dedicated sub-logger. Unknown analyses, and analyses without a
        ``main`` method, are logged as errors and skipped.

        Args:
            output_path: Directory passed to each analysis for its output
        """
        for analysis in self.manifest_content["analyses"]:
            name = analysis["name"]
            if name not in self.discovery_map:
                self.logger.log.error(
                    "Analysis %s not supported, please write it" % name
                )
                continue

            analysis_method = get_module_method(discovery, name, "main")
            sub_logger = self.logger.add_log("discovery_%s" % name)
            if analysis_method is None:
                sub_logger.log.error(
                    "Discovery analysis %s doesn't implement a method named 'main'"
                    % name
                )
                continue

            analysis_method(
                self.aberration_list,
                output_path,
                self.reference_map,
                self.title,
                sub_logger.log,
            )