# Source path: CancerAtHomeV2/backend/pipeline/variant_caller.py
# Uploaded by: Mentors4EDU
# Commit: 7a92197 (verified) - "Upload 33 files"
"""
Variant Calling Pipeline
Process sequencing data to identify genetic variants
"""
from pathlib import Path
from typing import Dict, List, Optional
import yaml
import logging
from dataclasses import dataclass
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
@dataclass
class Variant:
    """Represents a genetic variant.

    The first seven fields are required; ``gene`` and ``consequence`` are
    optional annotations that default to ``None``. Instances are deliberately
    mutable: ``VariantCaller.annotate_variants`` fills in missing annotations
    in place.
    """

    # Contig name, written to the VCF CHROM column (e.g. 'chr17').
    chromosome: str
    # Coordinate written to the VCF POS column.
    position: int
    # Reference allele (VCF REF column).
    reference: str
    # Alternate allele (VCF ALT column).
    alternate: str
    # Call quality (VCF QUAL column).
    quality: float
    # Read depth, stored under the INFO key ``DP``.
    depth: int
    # Allele frequency, stored under the INFO key ``AF``.
    allele_frequency: float
    # Gene name, stored under the INFO key ``GENE`` when present.
    gene: Optional[str] = None
    # Functional consequence (e.g. 'missense'), INFO key ``CONS`` when present.
    consequence: Optional[str] = None
class VariantCaller:
    """Call variants from sequencing data.

    Settings come from the ``pipeline -> variant_calling`` section of a YAML
    config file. That section must provide ``min_coverage``,
    ``min_allele_frequency`` and ``output_dir``.
    """

    def __init__(self, config_path: str = "config.yml"):
        """Load the variant-calling settings and create the output directory.

        Args:
            config_path: Path to the YAML configuration file.

        Raises:
            OSError: If the config file cannot be opened.
            KeyError: If a required config key is missing.
        """
        # Explicit encoding so the result does not depend on the platform locale.
        with open(config_path, 'r', encoding='utf-8') as f:
            self.config = yaml.safe_load(f)['pipeline']['variant_calling']

        self.min_coverage = self.config['min_coverage']
        self.min_allele_frequency = self.config['min_allele_frequency']
        self.output_dir = Path(self.config['output_dir'])
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def call_variants(
        self,
        alignment_file: Path,
        reference_genome: Path,
        output_vcf: Optional[Path] = None
    ) -> Path:
        """
        Call variants from aligned sequencing data

        Args:
            alignment_file: BAM/SAM alignment file
            reference_genome: Reference genome FASTA (not read by the
                simulated caller)
            output_vcf: Output VCF file; defaults to
                ``<output_dir>/<alignment stem>_variants.vcf``

        Returns:
            Path to VCF file
        """
        # Accept plain strings as well as Path objects.
        alignment_file = Path(alignment_file)

        if output_vcf is None:
            output_vcf = self.output_dir / f"{alignment_file.stem}_variants.vcf"
        else:
            # A caller-supplied location may not exist yet; the default
            # location is already created in __init__.
            Path(output_vcf).parent.mkdir(parents=True, exist_ok=True)

        logger.info("Calling variants from %s", alignment_file.name)

        # Simulate variant calling for demo
        # In production, use tools like GATK, FreeBayes, or BCFtools
        variants = self._simulate_variant_calling()

        # Write VCF
        self._write_vcf(variants, output_vcf)

        logger.info("Identified %d variants", len(variants))
        return output_vcf

    def _simulate_variant_calling(self) -> List[Variant]:
        """Simulate variant calling for demo purposes.

        Returns:
            A fixed list of hard-coded variants; no input data is read.
        """
        # Common cancer-associated variants
        variants = [
            Variant('chr17', 7577538, 'C', 'T', 35.2, 50, 0.45, 'TP53', 'missense'),
            Variant('chr7', 140453136, 'A', 'T', 42.1, 65, 0.52, 'BRAF', 'missense'),
            Variant('chr13', 32914438, 'T', 'C', 38.7, 55, 0.48, 'BRCA2', 'missense'),
            Variant('chr17', 41244936, 'G', 'A', 40.3, 60, 0.50, 'BRCA1', 'missense'),
            Variant('chr3', 178936091, 'G', 'A', 33.5, 48, 0.43, 'PIK3CA', 'missense'),
            Variant('chr9', 133748283, 'T', 'G', 37.9, 52, 0.46, 'ABL1', 'missense'),
            Variant('chr12', 25398284, 'C', 'T', 39.4, 58, 0.49, 'KRAS', 'missense'),
        ]
        return variants

    def _write_vcf(self, variants: List[Variant], output_file: Path):
        """Write variants to VCF format.

        Each record's FILTER column is ``PASS`` when both the depth and the
        allele frequency meet the configured minimums, otherwise ``LowQual``.

        Args:
            variants: Variants to serialise, written in the given order.
            output_file: Destination path; overwritten if it exists.
        """
        with open(output_file, 'w', encoding='utf-8') as f:
            # VCF header
            f.write("##fileformat=VCFv4.2\n")
            f.write("##source=CancerAtHomeVariantCaller\n")
            # Declare the non-PASS filter used below so the header describes
            # every FILTER value that can appear in a record.
            f.write("##FILTER=<ID=LowQual,Description=\"Depth or allele frequency below configured minimum\">\n")
            f.write("##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">\n")
            f.write("##INFO=<ID=AF,Number=A,Type=Float,Description=\"Allele Frequency\">\n")
            f.write("##INFO=<ID=GENE,Number=1,Type=String,Description=\"Gene Name\">\n")
            f.write("##INFO=<ID=CONS,Number=1,Type=String,Description=\"Consequence\">\n")
            f.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n")

            # Variant records
            for v in variants:
                info = f"DP={v.depth};AF={v.allele_frequency:.3f}"
                if v.gene:
                    info += f";GENE={v.gene}"
                if v.consequence:
                    info += f";CONS={v.consequence}"

                passes = (
                    v.depth >= self.min_coverage
                    and v.allele_frequency >= self.min_allele_frequency
                )
                filter_status = "PASS" if passes else "LowQual"

                f.write(f"{v.chromosome}\t{v.position}\t.\t{v.reference}\t{v.alternate}\t{v.quality:.1f}\t{filter_status}\t{info}\n")

    @staticmethod
    def _parse_info(info_field: str) -> Dict[str, str]:
        """Split a VCF INFO column into a key -> value mapping.

        Entries without ``=`` (flags) are ignored. Only the first ``=`` of an
        entry separates key from value, so values may themselves contain ``=``.
        """
        info: Dict[str, str] = {}
        for item in info_field.split(';'):
            key, sep, value = item.partition('=')
            if sep:
                info[key] = value
        return info

    def filter_variants(
        self,
        vcf_file: Path,
        min_quality: float = 30.0
    ) -> List[Variant]:
        """Filter variants by quality metrics.

        Reads ``vcf_file`` and keeps every record whose QUAL is at least
        ``min_quality``. Header lines and records with fewer than eight
        columns are ignored. A record whose QUAL, POS, DP or AF cannot be
        parsed as a number is skipped with a warning rather than discarding
        the whole file.

        Args:
            vcf_file: Path of the VCF file to read.
            min_quality: Minimum QUAL value a record needs to be kept.

        Returns:
            The parsed variants that passed the filter, or an empty list if
            the file could not be read.
        """
        variants: List[Variant] = []

        try:
            with open(vcf_file, 'r', encoding='utf-8') as f:
                for line_no, line in enumerate(f, start=1):
                    if line.startswith('#'):
                        continue

                    fields = line.strip().split('\t')
                    if len(fields) < 8:
                        continue

                    try:
                        quality = float(fields[5])
                        if quality < min_quality:
                            continue

                        # Parse INFO field
                        info = self._parse_info(fields[7])

                        variant = Variant(
                            chromosome=fields[0],
                            position=int(fields[1]),
                            reference=fields[3],
                            alternate=fields[4],
                            quality=quality,
                            depth=int(info.get('DP', 0)),
                            allele_frequency=float(info.get('AF', 0)),
                            gene=info.get('GENE'),
                            consequence=info.get('CONS')
                        )
                    except ValueError as e:
                        # One bad record (e.g. QUAL '.') must not throw away
                        # the variants already collected.
                        logger.warning(
                            "Skipping unparseable VCF record at line %d: %s",
                            line_no, e
                        )
                        continue

                    variants.append(variant)

            logger.info(f"Filtered to {len(variants)} high-quality variants")
            return variants

        except (OSError, ValueError) as e:
            # OSError: file missing/unreadable. ValueError covers decoding
            # errors raised while iterating the file.
            logger.error(f"Error filtering variants: {e}")
            return []

    def annotate_variants(self, variants: List[Variant]) -> List[Variant]:
        """
        Annotate variants with functional information

        In production, integrate with tools like:
        - ANNOVAR
        - VEP (Variant Effect Predictor)
        - SnpEff

        The current implementation only fills in placeholders: a missing gene
        becomes "UNKNOWN" and a missing consequence becomes "unknown". The
        variants are modified in place and the same list is returned.
        """
        # Simulated annotation
        for variant in variants:
            if not variant.gene:
                variant.gene = "UNKNOWN"
            if not variant.consequence:
                variant.consequence = "unknown"

        return variants
class VariantAnalyzer:
    """Analyze and interpret variants"""

    # Common cancer genes used by identify_cancer_variants.
    CANCER_GENES = frozenset({
        'TP53', 'BRCA1', 'BRCA2', 'KRAS', 'EGFR', 'BRAF',
        'PIK3CA', 'APC', 'PTEN', 'MYC', 'RB1', 'CDKN2A',
    })

    # Consequence labels counted towards the mutation burden.
    CODING_CONSEQUENCES = frozenset({'missense', 'nonsense', 'frameshift'})

    def __init__(self):
        """Create the analyzer together with a default-configured VariantCaller.

        Note that constructing ``VariantCaller()`` reads ``config.yml`` and
        creates its output directory.
        """
        self.caller = VariantCaller()

    def identify_cancer_variants(self, variants: List[Variant]) -> List[Variant]:
        """Identify known cancer-associated variants.

        Args:
            variants: Variants to screen.

        Returns:
            The variants whose ``gene`` is in ``CANCER_GENES``, in input order.
            Variants without a gene never match.
        """
        cancer_variants = [
            v for v in variants
            if v.gene in self.CANCER_GENES
        ]

        logger.info(f"Found {len(cancer_variants)} cancer-associated variants")
        return cancer_variants

    def calculate_mutation_burden(
        self,
        variants: List[Variant],
        exome_size_mb: float = 30.0
    ) -> float:
        """Calculate tumor mutation burden (TMB).

        TMB = number of coding mutations per megabase. Only variants whose
        consequence is in ``CODING_CONSEQUENCES`` are counted.

        Args:
            variants: Variants to count.
            exome_size_mb: Size of the sequenced region in megabases. The
                default assumes an exome of ~30 Mb.

        Returns:
            Coding mutations per megabase.

        Raises:
            ValueError: If ``exome_size_mb`` is not positive.
        """
        if exome_size_mb <= 0:
            raise ValueError("exome_size_mb must be positive")

        coding_count = sum(
            1 for v in variants if v.consequence in self.CODING_CONSEQUENCES
        )

        tmb = coding_count / exome_size_mb

        logger.info(f"Tumor Mutation Burden: {tmb:.2f} mutations/Mb")
        return tmb