"""
Data Importer for Neo4j
Import cancer data from various sources into the graph database
"""

from pathlib import Path
from typing import Dict, List
import logging

from .db_manager import (
    DatabaseManager,
    GeneRepository,
    MutationRepository,
    PatientRepository,
    CancerTypeRepository
)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class DataImporter:
    """Import cancer genomics data into Neo4j.

    Owns one ``DatabaseManager`` and one repository per entity kind (gene,
    mutation, patient, cancer type), all built on that manager.  Call
    :meth:`close` when done, or use the importer as a context manager::

        with DataImporter() as importer:
            importer.import_sample_data()
    """

    def __init__(self):
        """Create the database manager and the repositories that share it."""
        self.db = DatabaseManager()
        self.gene_repo = GeneRepository(self.db)
        self.mutation_repo = MutationRepository(self.db)
        self.patient_repo = PatientRepository(self.db)
        self.cancer_repo = CancerTypeRepository(self.db)

    def __enter__(self) -> "DataImporter":
        """Return the importer itself so it can be bound by ``with ... as``."""
        return self

    def __exit__(self, exc_type, exc_value, exc_tb) -> None:
        """Close the database connection; exceptions are not suppressed."""
        self.close()

    def close(self):
        """Close database connection"""
        self.db.close()

    def import_sample_data(self):
        """Import sample cancer data for demonstration.

        Runs the steps in a fixed order: cancer types, genes, patients,
        patient-to-cancer-type links, mutations (each with its gene id), and
        finally mutation-to-patient links.

        Raises:
            KeyError: if a sample mutation has no entry in the
                mutation-to-gene mapping.
        """
        logger.info("Importing sample cancer data...")

        # NOTE(review): the order is kept exactly as it was; the link steps
        # refer to ids created by earlier steps, so presumably the
        # repositories need those records to exist first.
        self._import_sample_cancer_types()
        self._import_sample_genes()
        self._import_sample_patients()
        self._link_sample_diagnoses()
        self._import_sample_mutations()
        self._link_sample_patient_mutations()

        logger.info("Sample data import completed!")

    def _import_sample_cancer_types(self) -> None:
        """Pass each sample cancer type to the cancer type repository."""
        cancer_types = [
            {'cancer_type_id': 'BRCA', 'name': 'Breast Cancer',
             'tissue': 'Breast', 'disease_type': 'Adenocarcinoma'},
            {'cancer_type_id': 'LUAD', 'name': 'Lung Adenocarcinoma',
             'tissue': 'Lung', 'disease_type': 'Adenocarcinoma'},
            {'cancer_type_id': 'COAD', 'name': 'Colon Adenocarcinoma',
             'tissue': 'Colon', 'disease_type': 'Adenocarcinoma'},
            {'cancer_type_id': 'GBM', 'name': 'Glioblastoma',
             'tissue': 'Brain', 'disease_type': 'Glioblastoma'},
        ]

        for cancer_data in cancer_types:
            self.cancer_repo.create_cancer_type(cancer_data)
            logger.info("Created cancer type: %s", cancer_data['name'])

    def _import_sample_genes(self) -> None:
        """Pass each sample gene to the gene repository."""
        genes = [
            {'gene_id': 'ENSG00000141510', 'symbol': 'TP53',
             'name': 'Tumor protein p53',
             'chromosome': 'chr17', 'gene_type': 'protein_coding'},
            {'gene_id': 'ENSG00000157764', 'symbol': 'BRAF',
             'name': 'B-Raf proto-oncogene',
             'chromosome': 'chr7', 'gene_type': 'protein_coding'},
            {'gene_id': 'ENSG00000139618', 'symbol': 'BRCA2',
             'name': 'BRCA2 DNA repair associated',
             'chromosome': 'chr13', 'gene_type': 'protein_coding'},
            {'gene_id': 'ENSG00000012048', 'symbol': 'BRCA1',
             'name': 'BRCA1 DNA repair associated',
             'chromosome': 'chr17', 'gene_type': 'protein_coding'},
            {'gene_id': 'ENSG00000121879', 'symbol': 'PIK3CA',
             'name': 'Phosphatidylinositol-4,5-bisphosphate 3-kinase',
             'chromosome': 'chr3', 'gene_type': 'protein_coding'},
            {'gene_id': 'ENSG00000133703', 'symbol': 'KRAS',
             'name': 'KRAS proto-oncogene',
             'chromosome': 'chr12', 'gene_type': 'protein_coding'},
            {'gene_id': 'ENSG00000146648', 'symbol': 'EGFR',
             'name': 'Epidermal growth factor receptor',
             'chromosome': 'chr7', 'gene_type': 'protein_coding'},
        ]

        for gene_data in genes:
            self.gene_repo.create_gene(gene_data)
            logger.info("Created gene: %s", gene_data['symbol'])

    def _import_sample_patients(self) -> None:
        """Pass each sample patient to the patient repository."""
        patients = [
            {'patient_id': 'TCGA-A1-001', 'project_id': 'TCGA-BRCA',
             'age': 55, 'gender': 'female', 'race': 'white',
             'vital_status': 'alive'},
            {'patient_id': 'TCGA-A1-002', 'project_id': 'TCGA-BRCA',
             'age': 62, 'gender': 'female', 'race': 'asian',
             'vital_status': 'alive'},
            {'patient_id': 'TCGA-L1-001', 'project_id': 'TCGA-LUAD',
             'age': 68, 'gender': 'male', 'race': 'white',
             'vital_status': 'deceased'},
            {'patient_id': 'TCGA-L1-002', 'project_id': 'TCGA-LUAD',
             'age': 71, 'gender': 'male', 'race': 'black',
             'vital_status': 'alive'},
            {'patient_id': 'TCGA-C1-001', 'project_id': 'TCGA-COAD',
             'age': 58, 'gender': 'female', 'race': 'white',
             'vital_status': 'alive'},
        ]

        for patient_data in patients:
            self.patient_repo.create_patient(patient_data)
            logger.info("Created patient: %s", patient_data['patient_id'])

    def _link_sample_diagnoses(self) -> None:
        """Link each sample patient to a cancer type with stage and grade."""
        diagnoses = [
            {'patient_id': 'TCGA-A1-001', 'cancer_type_id': 'BRCA',
             'properties': {'stage': 'Stage II', 'grade': 'G2'}},
            {'patient_id': 'TCGA-A1-002', 'cancer_type_id': 'BRCA',
             'properties': {'stage': 'Stage III', 'grade': 'G3'}},
            {'patient_id': 'TCGA-L1-001', 'cancer_type_id': 'LUAD',
             'properties': {'stage': 'Stage IV', 'grade': 'G3'}},
            {'patient_id': 'TCGA-L1-002', 'cancer_type_id': 'LUAD',
             'properties': {'stage': 'Stage II', 'grade': 'G2'}},
            {'patient_id': 'TCGA-C1-001', 'cancer_type_id': 'COAD',
             'properties': {'stage': 'Stage III', 'grade': 'G2'}},
        ]

        for diagnosis in diagnoses:
            self.patient_repo.link_patient_to_cancer_type(
                diagnosis['patient_id'],
                diagnosis['cancer_type_id'],
                diagnosis['properties']
            )

    def _import_sample_mutations(self) -> None:
        """Pass each sample mutation, with its gene id, to the mutation repository."""
        mutations = [
            {'mutation_id': 'MUT-TP53-001', 'chromosome': 'chr17',
             'position': 7577538, 'reference': 'C', 'alternate': 'T',
             'consequence': 'missense', 'variant_type': 'SNP',
             'quality': 35.2},
            {'mutation_id': 'MUT-BRAF-001', 'chromosome': 'chr7',
             'position': 140453136, 'reference': 'A', 'alternate': 'T',
             'consequence': 'missense', 'variant_type': 'SNP',
             'quality': 42.1},
            {'mutation_id': 'MUT-BRCA2-001', 'chromosome': 'chr13',
             'position': 32914438, 'reference': 'T', 'alternate': 'C',
             'consequence': 'missense', 'variant_type': 'SNP',
             'quality': 38.7},
            {'mutation_id': 'MUT-PIK3CA-001', 'chromosome': 'chr3',
             'position': 178936091, 'reference': 'G', 'alternate': 'A',
             'consequence': 'missense', 'variant_type': 'SNP',
             'quality': 33.5},
            {'mutation_id': 'MUT-KRAS-001', 'chromosome': 'chr12',
             'position': 25398284, 'reference': 'C', 'alternate': 'T',
             'consequence': 'missense', 'variant_type': 'SNP',
             'quality': 39.4},
        ]

        gene_mutations = [
            ('MUT-TP53-001', 'ENSG00000141510'),
            ('MUT-BRAF-001', 'ENSG00000157764'),
            ('MUT-BRCA2-001', 'ENSG00000139618'),
            ('MUT-PIK3CA-001', 'ENSG00000121879'),
            ('MUT-KRAS-001', 'ENSG00000133703'),
        ]

        # Look the gene up by mutation id rather than by list position, so a
        # reordered or incomplete mapping fails with KeyError instead of
        # silently attaching a mutation to the wrong gene.
        gene_id_by_mutation = dict(gene_mutations)

        for mutation_data in mutations:
            gene_id = gene_id_by_mutation[mutation_data['mutation_id']]
            self.mutation_repo.create_mutation(mutation_data, gene_id)
            logger.info("Created mutation: %s", mutation_data['mutation_id'])

    def _link_sample_patient_mutations(self) -> None:
        """Link sample mutations to patients with allele frequency and depth."""
        patient_mutations = [
            {'patient_id': 'TCGA-A1-001', 'mutation_id': 'MUT-TP53-001',
             'properties': {'allele_frequency': 0.45, 'depth': 50}},
            {'patient_id': 'TCGA-A1-001', 'mutation_id': 'MUT-PIK3CA-001',
             'properties': {'allele_frequency': 0.38, 'depth': 48}},
            {'patient_id': 'TCGA-A1-002', 'mutation_id': 'MUT-BRCA2-001',
             'properties': {'allele_frequency': 0.52, 'depth': 55}},
            {'patient_id': 'TCGA-L1-001', 'mutation_id': 'MUT-KRAS-001',
             'properties': {'allele_frequency': 0.49, 'depth': 58}},
            {'patient_id': 'TCGA-L1-001', 'mutation_id': 'MUT-TP53-001',
             'properties': {'allele_frequency': 0.41, 'depth': 45}},
            {'patient_id': 'TCGA-L1-002', 'mutation_id': 'MUT-BRAF-001',
             'properties': {'allele_frequency': 0.47, 'depth': 52}},
            {'patient_id': 'TCGA-C1-001', 'mutation_id': 'MUT-KRAS-001',
             'properties': {'allele_frequency': 0.44, 'depth': 50}},
        ]

        for pm in patient_mutations:
            # The repository takes the mutation id first, then the patient id.
            self.mutation_repo.link_mutation_to_patient(
                pm['mutation_id'],
                pm['patient_id'],
                pm['properties']
            )

    def import_gdc_data(self, gdc_files: List[Dict]):
        """Import data from GDC portal.

        Placeholder: not implemented yet.  The call currently does nothing
        and returns ``None``; ``gdc_files`` is ignored.
        """
        # Implementation for importing real GDC data
        pass


def initialize_database():
    """Initialize database with sample data.

    Builds a ``DataImporter``, imports the sample data set, and closes the
    database connection afterwards even if the import raises.
    """
    with DataImporter() as importer:
        importer.import_sample_data()


if __name__ == "__main__":
    initialize_database()