CancerAtHomeV2 / backend /neo4j /data_importer.py
Mentors4EDU's picture
Upload 33 files
7a92197 verified
"""
Data Importer for Neo4j
Import cancer data from various sources into the graph database
"""
from pathlib import Path
from typing import Dict, List
import logging
from .db_manager import (
DatabaseManager,
GeneRepository,
MutationRepository,
PatientRepository,
CancerTypeRepository
)
# Configure the root logger at INFO level as a side effect of importing this
# module. basicConfig() does nothing if the root logger already has handlers,
# so an application that set up logging earlier keeps its own configuration.
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
class DataImporter:
    """Import cancer genomics data into Neo4j.

    Owns one ``DatabaseManager`` plus one repository per entity kind (gene,
    mutation, patient, cancer type), all built on that same manager.  Call
    :meth:`close` when finished, or use the instance as a context manager so
    the manager is closed automatically::

        with DataImporter() as importer:
            importer.import_sample_data()
    """

    def __init__(self):
        """Create the database manager and the repositories that share it."""
        self.db = DatabaseManager()
        self.gene_repo = GeneRepository(self.db)
        self.mutation_repo = MutationRepository(self.db)
        self.patient_repo = PatientRepository(self.db)
        self.cancer_repo = CancerTypeRepository(self.db)

    def __enter__(self):
        """Return the importer itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the database manager; exceptions are never suppressed."""
        self.close()

    def close(self):
        """Close database connection"""
        self.db.close()

    def import_sample_data(self):
        """Import sample cancer data for demonstration.

        Runs the steps in a fixed order: cancer types, genes and patients
        first, then patient diagnoses, then mutations (each with its gene),
        and finally the patient/mutation links.
        """
        logger.info("Importing sample cancer data...")
        # Entities are created before the links that name them.
        # NOTE(review): presumably the link calls expect both endpoints to
        # exist already - confirm against the repository implementations.
        self._import_sample_cancer_types()
        self._import_sample_genes()
        self._import_sample_patients()
        self._link_sample_diagnoses()
        self._import_sample_mutations()
        self._link_sample_patient_mutations()
        logger.info("Sample data import completed!")

    def _import_sample_cancer_types(self):
        """Pass each sample cancer type to the cancer-type repository."""
        cancer_types = [
            {'cancer_type_id': 'BRCA', 'name': 'Breast Cancer', 'tissue': 'Breast', 'disease_type': 'Adenocarcinoma'},
            {'cancer_type_id': 'LUAD', 'name': 'Lung Adenocarcinoma', 'tissue': 'Lung', 'disease_type': 'Adenocarcinoma'},
            {'cancer_type_id': 'COAD', 'name': 'Colon Adenocarcinoma', 'tissue': 'Colon', 'disease_type': 'Adenocarcinoma'},
            {'cancer_type_id': 'GBM', 'name': 'Glioblastoma', 'tissue': 'Brain', 'disease_type': 'Glioblastoma'},
        ]
        for cancer_data in cancer_types:
            self.cancer_repo.create_cancer_type(cancer_data)
            logger.info("Created cancer type: %s", cancer_data['name'])

    def _import_sample_genes(self):
        """Pass each sample gene to the gene repository."""
        genes = [
            {'gene_id': 'ENSG00000141510', 'symbol': 'TP53', 'name': 'Tumor protein p53', 'chromosome': 'chr17', 'gene_type': 'protein_coding'},
            {'gene_id': 'ENSG00000157764', 'symbol': 'BRAF', 'name': 'B-Raf proto-oncogene', 'chromosome': 'chr7', 'gene_type': 'protein_coding'},
            {'gene_id': 'ENSG00000139618', 'symbol': 'BRCA2', 'name': 'BRCA2 DNA repair associated', 'chromosome': 'chr13', 'gene_type': 'protein_coding'},
            {'gene_id': 'ENSG00000012048', 'symbol': 'BRCA1', 'name': 'BRCA1 DNA repair associated', 'chromosome': 'chr17', 'gene_type': 'protein_coding'},
            {'gene_id': 'ENSG00000121879', 'symbol': 'PIK3CA', 'name': 'Phosphatidylinositol-4,5-bisphosphate 3-kinase', 'chromosome': 'chr3', 'gene_type': 'protein_coding'},
            {'gene_id': 'ENSG00000133703', 'symbol': 'KRAS', 'name': 'KRAS proto-oncogene', 'chromosome': 'chr12', 'gene_type': 'protein_coding'},
            {'gene_id': 'ENSG00000146648', 'symbol': 'EGFR', 'name': 'Epidermal growth factor receptor', 'chromosome': 'chr7', 'gene_type': 'protein_coding'},
        ]
        for gene_data in genes:
            self.gene_repo.create_gene(gene_data)
            logger.info("Created gene: %s", gene_data['symbol'])

    def _import_sample_patients(self):
        """Pass each sample patient to the patient repository."""
        patients = [
            {'patient_id': 'TCGA-A1-001', 'project_id': 'TCGA-BRCA', 'age': 55, 'gender': 'female', 'race': 'white', 'vital_status': 'alive'},
            {'patient_id': 'TCGA-A1-002', 'project_id': 'TCGA-BRCA', 'age': 62, 'gender': 'female', 'race': 'asian', 'vital_status': 'alive'},
            {'patient_id': 'TCGA-L1-001', 'project_id': 'TCGA-LUAD', 'age': 68, 'gender': 'male', 'race': 'white', 'vital_status': 'deceased'},
            {'patient_id': 'TCGA-L1-002', 'project_id': 'TCGA-LUAD', 'age': 71, 'gender': 'male', 'race': 'black', 'vital_status': 'alive'},
            {'patient_id': 'TCGA-C1-001', 'project_id': 'TCGA-COAD', 'age': 58, 'gender': 'female', 'race': 'white', 'vital_status': 'alive'},
        ]
        for patient_data in patients:
            self.patient_repo.create_patient(patient_data)
            logger.info("Created patient: %s", patient_data['patient_id'])

    def _link_sample_diagnoses(self):
        """Link each sample patient to a cancer type, with stage and grade."""
        diagnoses = [
            {'patient_id': 'TCGA-A1-001', 'cancer_type_id': 'BRCA', 'properties': {'stage': 'Stage II', 'grade': 'G2'}},
            {'patient_id': 'TCGA-A1-002', 'cancer_type_id': 'BRCA', 'properties': {'stage': 'Stage III', 'grade': 'G3'}},
            {'patient_id': 'TCGA-L1-001', 'cancer_type_id': 'LUAD', 'properties': {'stage': 'Stage IV', 'grade': 'G3'}},
            {'patient_id': 'TCGA-L1-002', 'cancer_type_id': 'LUAD', 'properties': {'stage': 'Stage II', 'grade': 'G2'}},
            {'patient_id': 'TCGA-C1-001', 'cancer_type_id': 'COAD', 'properties': {'stage': 'Stage III', 'grade': 'G2'}},
        ]
        for diagnosis in diagnoses:
            self.patient_repo.link_patient_to_cancer_type(
                diagnosis['patient_id'],
                diagnosis['cancer_type_id'],
                diagnosis['properties'],
            )

    def _import_sample_mutations(self):
        """Create each sample mutation together with the gene it belongs to.

        Raises:
            KeyError: if a mutation has no entry in the mutation-to-gene map.
        """
        mutations = [
            {'mutation_id': 'MUT-TP53-001', 'chromosome': 'chr17', 'position': 7577538, 'reference': 'C', 'alternate': 'T', 'consequence': 'missense', 'variant_type': 'SNP', 'quality': 35.2},
            {'mutation_id': 'MUT-BRAF-001', 'chromosome': 'chr7', 'position': 140453136, 'reference': 'A', 'alternate': 'T', 'consequence': 'missense', 'variant_type': 'SNP', 'quality': 42.1},
            {'mutation_id': 'MUT-BRCA2-001', 'chromosome': 'chr13', 'position': 32914438, 'reference': 'T', 'alternate': 'C', 'consequence': 'missense', 'variant_type': 'SNP', 'quality': 38.7},
            {'mutation_id': 'MUT-PIK3CA-001', 'chromosome': 'chr3', 'position': 178936091, 'reference': 'G', 'alternate': 'A', 'consequence': 'missense', 'variant_type': 'SNP', 'quality': 33.5},
            {'mutation_id': 'MUT-KRAS-001', 'chromosome': 'chr12', 'position': 25398284, 'reference': 'C', 'alternate': 'T', 'consequence': 'missense', 'variant_type': 'SNP', 'quality': 39.4},
        ]
        # Keyed by mutation_id so the pairing cannot drift if either collection
        # is reordered or extended; a missing entry fails loudly instead of
        # being silently dropped the way a positional zip() would.
        gene_for_mutation = {
            'MUT-TP53-001': 'ENSG00000141510',
            'MUT-BRAF-001': 'ENSG00000157764',
            'MUT-BRCA2-001': 'ENSG00000139618',
            'MUT-PIK3CA-001': 'ENSG00000121879',
            'MUT-KRAS-001': 'ENSG00000133703',
        }
        for mutation_data in mutations:
            gene_id = gene_for_mutation[mutation_data['mutation_id']]
            self.mutation_repo.create_mutation(mutation_data, gene_id)
            logger.info("Created mutation: %s", mutation_data['mutation_id'])

    def _link_sample_patient_mutations(self):
        """Link sample mutations to the patients that carry them."""
        patient_mutations = [
            {'patient_id': 'TCGA-A1-001', 'mutation_id': 'MUT-TP53-001', 'properties': {'allele_frequency': 0.45, 'depth': 50}},
            {'patient_id': 'TCGA-A1-001', 'mutation_id': 'MUT-PIK3CA-001', 'properties': {'allele_frequency': 0.38, 'depth': 48}},
            {'patient_id': 'TCGA-A1-002', 'mutation_id': 'MUT-BRCA2-001', 'properties': {'allele_frequency': 0.52, 'depth': 55}},
            {'patient_id': 'TCGA-L1-001', 'mutation_id': 'MUT-KRAS-001', 'properties': {'allele_frequency': 0.49, 'depth': 58}},
            {'patient_id': 'TCGA-L1-001', 'mutation_id': 'MUT-TP53-001', 'properties': {'allele_frequency': 0.41, 'depth': 45}},
            {'patient_id': 'TCGA-L1-002', 'mutation_id': 'MUT-BRAF-001', 'properties': {'allele_frequency': 0.47, 'depth': 52}},
            {'patient_id': 'TCGA-C1-001', 'mutation_id': 'MUT-KRAS-001', 'properties': {'allele_frequency': 0.44, 'depth': 50}},
        ]
        for link in patient_mutations:
            # The repository call takes the mutation id first, then the patient id.
            self.mutation_repo.link_mutation_to_patient(
                link['mutation_id'],
                link['patient_id'],
                link['properties'],
            )

    def import_gdc_data(self, gdc_files: List[Dict]):
        """Import data from GDC portal.

        Not implemented yet: nothing is imported and ``None`` is returned.  A
        warning is logged so that a caller does not mistake the no-op for a
        successful import.
        """
        logger.warning("import_gdc_data is not implemented yet; no GDC data was imported")
def initialize_database():
    """Load the demonstration data set into the database.

    Builds a ``DataImporter``, runs its sample-data import, and closes the
    importer afterwards whether or not the import succeeded.  Any exception
    raised by the import propagates to the caller.
    """
    sample_importer = DataImporter()
    try:
        sample_importer.import_sample_data()
    finally:
        # Release the database connection even when the import fails.
        sample_importer.close()
# Script entry point: load the sample data when this module is executed.
# NOTE(review): the module uses a relative import (``from .db_manager import ...``),
# so it presumably has to be run as part of its package, e.g.
# ``python -m <package>.data_importer``, rather than as a bare file - confirm.
if __name__ == "__main__":
    initialize_database()