"""Data models for genomic aberrations and biallelic inactivation hits.
This module defines the core data structures used throughout the biallelic
inactivation analysis pipeline, including enums for aberration types and
classes for representing samples, donors, and genomic variations.
"""
from enum import Enum
from typing import Optional
from biallelic.misc import camel_case_split
class Gender(Enum):
    """Sex recorded for the donor of a sample.

    Attributes:
        Unknown: Sex was not specified or is not known (value: 0).
        Male: The donor is male (value: 1).
        Female: The donor is female (value: 2).
    """

    # "No information" member.
    Unknown = 0

    # Known sexes.
    Male = 1
    Female = 2
class OmicsType(Enum):
    """Kind of omics data a sample carries.

    Distinguishes genomic mutation data, epigenetic (methylation) data and
    gene expression data.

    Attributes:
        Genomics: DNA sequence variants and structural changes (value: 1).
        Methylomics: DNA methylation status (value: 2).
        Transcriptomics: Gene expression data (value: 3).
    """

    Genomics = 1         # DNA-level variants
    Methylomics = 2      # methylation status
    Transcriptomics = 3  # expression data
class AberrationType(Enum):
    """Categories of genomic aberration that can be reported for a sample.

    Covers somatic mutations, structural variants, copy number changes,
    germline variants and epigenetic modifications.

    Attributes:
        SV: Structural variant (value: 1).
        SNV: Single nucleotide variant (value: 2).
        INDEL: Small insertion/deletion (value: 3).
        CNN_LOH: Copy number neutral loss of heterozygosity (value: 4).
        GAIN_LOH: Copy number gain with loss of heterozygosity (value: 5).
        GERM_SNV: Germline single nucleotide variant (value: 6).
        HOM_LOSS: Homozygous loss, i.e. deletion (value: 7).
        HET_LOSS: Heterozygous loss, i.e. deletion (value: 8).
        GERM_HET_LOSS: Germline heterozygous loss (value: 9).
        GERM_HOM_LOSS: Germline homozygous loss (value: 10).
        GERM_HOM_SNV: Germline homozygous SNV (value: 11).
        METHYL: Promoter methylation silencing (value: 12).
        AMP: High-level copy number amplification (value: 13).
        GAIN: Low-level copy number gain (value: 14).
    """

    # Sequence-level and structural variants.
    SV = 1
    SNV = 2
    INDEL = 3

    # Loss of heterozygosity without a net copy number loss.
    CNN_LOH = 4
    GAIN_LOH = 5

    # Germline SNV.
    GERM_SNV = 6

    # Copy number losses.
    HOM_LOSS = 7
    HET_LOSS = 8

    # Further germline events.
    GERM_HET_LOSS = 9
    GERM_HOM_LOSS = 10
    GERM_HOM_SNV = 11

    # Epigenetic silencing.
    METHYL = 12

    # Copy number increases.
    AMP = 13
    GAIN = 14
class DoubleHitType(Enum):
    """Classification of a biallelic inactivation by its pair of hits.

    Every member name has the form ``<FirstHit>_<SecondHit>`` and describes
    the two events that together affect both alleles of a gene, in the sense
    of the Knudson two-hit hypothesis for tumor suppressor inactivation.

    Attributes:
        SomLoss_SomLoss: Two somatic copy number losses (value: 1).
        SomLoss_SomSnv: Somatic loss + somatic SNV (value: 2).
        SomCnLoh_SomSnv: Somatic CNN-LOH + somatic SNV (value: 3).
        SomGainLoh_SomSnv: Somatic gain-LOH + somatic SNV (value: 4).
        SomLoss_SomSv: Somatic loss + somatic structural variant (value: 5).
        SomSnv_SomSnv: Two somatic SNVs (value: 6).
        SomLoss_SomIndel: Somatic loss + somatic indel (value: 7).
        GermLoss_SomLoss: Germline loss + somatic loss (value: 8).
        GermLoss_GermLoss: Two germline losses (value: 9).
        GermLoss_GermSnp: Germline loss + germline SNP (value: 10).
        GermLoss_GermSv: Germline loss + germline structural variant
            (value: 11).
        GermSnp_SomLoss: Germline SNP + somatic loss (value: 12).
        GermSv_SomLoss: Germline SV + somatic loss (value: 13).
        GermSnp_GermSnp: Two germline SNPs (value: 14).
        SomLoss_Methyl: Somatic loss + promoter methylation (value: 15).
        SubclonalLoss_SomSnv: Subclonal loss + somatic SNV (value: 16).
        SomLoss_SubclonalSnv: Somatic loss + subclonal SNV (value: 17).
    """

    # Both hits somatic.
    SomLoss_SomLoss = 1
    SomLoss_SomSnv = 2
    SomCnLoh_SomSnv = 3
    SomGainLoh_SomSnv = 4
    SomLoss_SomSv = 5
    SomSnv_SomSnv = 6
    SomLoss_SomIndel = 7

    # At least one germline hit.
    GermLoss_SomLoss = 8
    GermLoss_GermLoss = 9
    GermLoss_GermSnp = 10
    GermLoss_GermSv = 11
    GermSnp_SomLoss = 12
    GermSv_SomLoss = 13
    GermSnp_GermSnp = 14

    # Methylation and subclonal combinations.
    SomLoss_Methyl = 15
    SubclonalLoss_SomSnv = 16
    SomLoss_SubclonalSnv = 17

    def __str__(self) -> str:
        """Render the member as ``"<first hit>/<second hit>"`` in lower case.

        The member name is split on ``"_"``. Each of the first two pieces is
        passed through ``camel_case_split``, the resulting words are joined
        with underscores and lower-cased, and the two halves are joined with
        a slash.

        Example (assuming ``camel_case_split`` breaks a CamelCase word into
        its component words): ``SomLoss_SomSnv`` -> ``som_loss/som_snv``.

        Returns:
            The string ``"hit1/hit2"``.
        """
        halves = self.name.split("_")
        # Index explicitly so only the first two pieces are ever used.
        rendered = [
            "_".join(camel_case_split(halves[position])).lower()
            for position in (0, 1)
        ]
        return "/".join(rendered)
class SampleDonor:
    """Metadata describing one sample and the donor it came from.

    Holds the sample and donor identifiers together with the donor's sex,
    the omics data type, tumor purity and ploidy.

    Attributes:
        sample_id: Unique identifier for the sample.
        donor_id: Unique identifier for the donor.
        gender: Name of the ``Gender`` member passed to the constructor,
            stored as a string (e.g. ``"Female"``), not as the enum member.
        omics: Type of omics data for this sample, stored as the
            ``OmicsType`` member that was passed in.
        cellularity: Tumor purity, documented as a fraction 0-1
            (default: 0). The value is stored as given, without validation.
        ploidy: Average ploidy of the sample (default: 2, i.e. diploid).
        matching_sample_id: ID of the matched normal sample if available
            (default: "").

    Example:
        >>> from biallelic.models import SampleDonor, Gender, OmicsType
        >>> donor = SampleDonor(
        ...     sample_id="TCGA-A1-A0SB-01",
        ...     donor_id="TCGA-A1-A0SB",
        ...     gender=Gender.Female,
        ...     omics=OmicsType.Genomics,
        ...     cellularity=0.8,
        ...     ploidy=2.0,
        ...     matching_sample_id="TCGA-A1-A0SB-10"
        ... )
    """

    def __init__(
        self,
        sample_id: str,
        donor_id: str,
        gender: Gender,
        omics: OmicsType = OmicsType.Genomics,
        cellularity: float = 0,
        ploidy: float = 2,
        matching_sample_id: str = "",
    ) -> None:
        """Store the sample/donor metadata on the new instance.

        Args:
            sample_id: Unique identifier for the sample.
            donor_id: Unique identifier for the donor.
            gender: ``Gender`` member; only its ``name`` is kept.
            omics: ``OmicsType`` member (default: ``OmicsType.Genomics``).
            cellularity: Tumor purity fraction 0-1 (default: 0).
            ploidy: Average ploidy level (default: 2).
            matching_sample_id: Matched normal sample ID if available
                (default: "").
        """
        # Identifiers are stored exactly as passed in.
        self.sample_id, self.donor_id = sample_id, donor_id

        # Only the member name is retained, so ``gender`` is a plain string,
        # whereas ``omics`` keeps the enum member itself.
        self.gender = gender.name
        self.omics = omics

        # Sample-level numeric properties, stored without conversion.
        self.cellularity, self.ploidy = cellularity, ploidy

        self.matching_sample_id = matching_sample_id
class Aberration:
    """One genomic event observed at a specific location in a sample.

    An event can be an SNV, indel, SV, copy number change, methylation, etc.
    Every constructor argument is coerced to a fixed Python type, so callers
    may pass values such as numeric strings.

    Attributes:
        chrom: Chromosome identifier as a string (e.g., "1", "X", "MT").
        start: Start coordinate as an integer (documented as 0-based).
        end: End coordinate as an integer (documented as 0-based).
        type: Name of the ``AberrationType`` member, stored as a string.
        subtype: Functional consequence or specific type (e.g., "missense",
            "frameshift").
        sample_id: Identifier of the sample where the aberration was found.
        vaf: Variant allele frequency as a float, or None if unavailable.
        n_copy: Copy number at this location as an int, or None if
            unavailable.
        gene: Gene name or symbol ("." if intergenic).
        id: Unique identifier for this aberration ("." if not available).

    Example:
        >>> from biallelic.models import Aberration, AberrationType
        >>> ab = Aberration(
        ...     chrom="17",
        ...     start=7577121,
        ...     end=7577121,
        ...     aberration_type=AberrationType.SNV,
        ...     aberration_subtype="missense",
        ...     sample_id="TCGA-A1-A0SB-01",
        ...     vaf=0.45,
        ...     gene="TP53",
        ...     id="rs1234567"
        ... )
    """

    def __init__(
        self,
        chrom: str,
        start: int,
        end: int,
        aberration_type: AberrationType,
        aberration_subtype: str,
        sample_id: str,
        vaf: Optional[float] = None,
        n_copy: Optional[int] = None,
        gene: str = ".",
        id: str = ".",
    ) -> None:
        """Coerce the arguments to their storage types and keep them.

        Args:
            chrom: Chromosome identifier; converted with ``str``.
            start: Start coordinate; converted with ``int``.
            end: End coordinate; converted with ``int``.
            aberration_type: ``AberrationType`` member; its ``name`` is kept.
            aberration_subtype: Functional consequence or type detail.
            sample_id: Sample where the aberration was detected.
            vaf: Variant allele frequency 0-1, or None (default: None).
            n_copy: Copy number at the locus, or None (default: None).
            gene: Gene name/symbol or "." for intergenic (default: ".").
            id: Unique identifier or "." if unavailable (default: ".").
        """
        # Genomic location.
        self.chrom = str(chrom)
        self.start = int(start)
        self.end = int(end)

        # Classification: the enum member is reduced to its name.
        self.type = str(aberration_type.name)
        self.subtype = str(aberration_subtype)

        self.sample_id = str(sample_id)

        # Optional measurements: None means "not available" and is kept
        # as-is; anything else is converted to its numeric type.
        self.vaf = None if vaf is None else float(vaf)
        self.n_copy = None if n_copy is None else int(n_copy)

        # Annotation fields; "." is the placeholder for a missing value.
        self.gene = str(gene)
        self.id = str(id)
class DoubleHit:
    """Record of a biallelic inactivation event found in one sample.

    Describes two independent hits on both alleles of a gene, following the
    Knudson two-hit hypothesis for tumor suppressor inactivation. Documented
    as the core output of the biallelic discovery analysis. Every field is
    stored as a string.

    Attributes:
        gene: Gene name or symbol affected by the biallelic inactivation.
        cytoband: Cytogenetic band location of the gene (e.g., "17p13.1").
        first_hit: Identifier for the first mutational event.
        first_hit_type: Type/category of the first hit (e.g., "SNV",
            "HOM_LOSS").
        second_hit: Identifier for the second mutational event.
        second_hit_type: Type/category of the second hit.
        hit_type: ``str()`` of the ``DoubleHitType`` passed in, i.e. the
            text produced by that enum's ``__str__``.
        sample_id: Sample identifier where the event was detected.
        donor_id: Donor/patient identifier.
        id: Unique identifier for this event ("." if unavailable).

    Example:
        >>> from biallelic.models import DoubleHit, DoubleHitType
        >>> hit = DoubleHit(
        ...     gene="TP53",
        ...     cytoband="17p13.1",
        ...     first_hit="rs1234567",
        ...     first_hit_type="SNV",
        ...     second_hit="LOSS_17p",
        ...     second_hit_type="HOM_LOSS",
        ...     hit_type=DoubleHitType.SomLoss_SomSnv,
        ...     sample_id="TCGA-A1-A0SB-01",
        ...     donor_id="TCGA-A1-A0SB",
        ...     id="BH_000001"
        ... )
    """

    def __init__(
        self,
        gene: str,
        cytoband: str,
        first_hit: str,
        first_hit_type: str,
        second_hit: str,
        second_hit_type: str,
        hit_type: DoubleHitType,
        sample_id: str,
        donor_id: str,
        id: str = ".",
    ) -> None:
        """Convert every argument to ``str`` and store it on the instance.

        Args:
            gene: Gene name or symbol affected.
            cytoband: Cytogenetic band location (e.g., "17p13.1").
            first_hit: Identifier/description of the first mutational event.
            first_hit_type: Functional type of the first hit.
            second_hit: Identifier/description of the second mutational
                event.
            second_hit_type: Functional type of the second hit.
            hit_type: Classification of the hit pair (``DoubleHitType``).
            sample_id: Sample identifier.
            donor_id: Donor/patient identifier.
            id: Unique identifier or "." if unavailable (default: ".").
        """
        # Affected gene and where it sits.
        self.gene = str(gene)
        self.cytoband = str(cytoband)

        # The two events, each with its identifier and its category.
        self.first_hit = str(first_hit)
        self.first_hit_type = str(first_hit_type)
        self.second_hit = str(second_hit)
        self.second_hit_type = str(second_hit_type)

        # str() goes through the enum's own __str__, so the stored value is
        # that rendering and not the member name.
        self.hit_type = str(hit_type)

        # Provenance of the event.
        self.sample_id = str(sample_id)
        self.donor_id = str(donor_id)
        self.id = str(id)