|
|
"""
|
|
|
Data Importer for Neo4j
|
|
|
Import cancer data from various sources into the graph database
|
|
|
"""
|
|
|
|
|
|
from pathlib import Path
|
|
|
from typing import Dict, List
|
|
|
import logging
|
|
|
from .db_manager import (
|
|
|
DatabaseManager,
|
|
|
GeneRepository,
|
|
|
MutationRepository,
|
|
|
PatientRepository,
|
|
|
CancerTypeRepository
|
|
|
)
|
|
|
|
|
|
# Configure the root logger at import time so INFO-level progress messages
# emitted by the importer are shown.
logging.basicConfig(level=logging.INFO)

# Module-scoped logger, named after this module.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
class DataImporter:
    """Import cancer genomics data into Neo4j.

    Holds one ``DatabaseManager`` plus a repository per entity kind (genes,
    mutations, patients, cancer types), each constructed from that manager.
    Call :meth:`close` when finished, or use the instance in a ``with``
    statement so that :meth:`close` is called automatically.
    """

    def __init__(self):
        """Create the database manager and the repositories built on it."""
        self.db = DatabaseManager()
        self.gene_repo = GeneRepository(self.db)
        self.mutation_repo = MutationRepository(self.db)
        self.patient_repo = PatientRepository(self.db)
        self.cancer_repo = CancerTypeRepository(self.db)

    def __enter__(self) -> "DataImporter":
        """Return the importer itself for use in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Close the importer on leaving a ``with`` block.

        Returns ``None`` so any exception raised inside the block propagates.
        """
        self.close()

    def close(self):
        """Close database connection"""
        self.db.close()

    def import_sample_data(self):
        """Import sample cancer data for demonstration.

        Runs the import stages in a fixed order: cancer types, genes,
        patients, patient-to-cancer-type links, mutations (each created
        together with its gene id), and finally mutation-to-patient links.
        Entities are therefore created before the links that refer to them.
        """
        logger.info("Importing sample cancer data...")

        self._import_sample_cancer_types()
        self._import_sample_genes()
        self._import_sample_patients()
        self._link_sample_diagnoses()
        self._import_sample_mutations()
        self._link_sample_patient_mutations()

        logger.info("Sample data import completed!")

    def _import_sample_cancer_types(self):
        """Create the sample cancer types through the cancer-type repository."""
        cancer_types = [
            {'cancer_type_id': 'BRCA', 'name': 'Breast Cancer', 'tissue': 'Breast', 'disease_type': 'Adenocarcinoma'},
            {'cancer_type_id': 'LUAD', 'name': 'Lung Adenocarcinoma', 'tissue': 'Lung', 'disease_type': 'Adenocarcinoma'},
            {'cancer_type_id': 'COAD', 'name': 'Colon Adenocarcinoma', 'tissue': 'Colon', 'disease_type': 'Adenocarcinoma'},
            {'cancer_type_id': 'GBM', 'name': 'Glioblastoma', 'tissue': 'Brain', 'disease_type': 'Glioblastoma'},
        ]

        for cancer_data in cancer_types:
            self.cancer_repo.create_cancer_type(cancer_data)
            logger.info(f"Created cancer type: {cancer_data['name']}")

    def _import_sample_genes(self):
        """Create the sample genes through the gene repository."""
        genes = [
            {'gene_id': 'ENSG00000141510', 'symbol': 'TP53', 'name': 'Tumor protein p53', 'chromosome': 'chr17', 'gene_type': 'protein_coding'},
            {'gene_id': 'ENSG00000157764', 'symbol': 'BRAF', 'name': 'B-Raf proto-oncogene', 'chromosome': 'chr7', 'gene_type': 'protein_coding'},
            {'gene_id': 'ENSG00000139618', 'symbol': 'BRCA2', 'name': 'BRCA2 DNA repair associated', 'chromosome': 'chr13', 'gene_type': 'protein_coding'},
            {'gene_id': 'ENSG00000012048', 'symbol': 'BRCA1', 'name': 'BRCA1 DNA repair associated', 'chromosome': 'chr17', 'gene_type': 'protein_coding'},
            {'gene_id': 'ENSG00000121879', 'symbol': 'PIK3CA', 'name': 'Phosphatidylinositol-4,5-bisphosphate 3-kinase', 'chromosome': 'chr3', 'gene_type': 'protein_coding'},
            {'gene_id': 'ENSG00000133703', 'symbol': 'KRAS', 'name': 'KRAS proto-oncogene', 'chromosome': 'chr12', 'gene_type': 'protein_coding'},
            {'gene_id': 'ENSG00000146648', 'symbol': 'EGFR', 'name': 'Epidermal growth factor receptor', 'chromosome': 'chr7', 'gene_type': 'protein_coding'},
        ]

        for gene_data in genes:
            self.gene_repo.create_gene(gene_data)
            logger.info(f"Created gene: {gene_data['symbol']}")

    def _import_sample_patients(self):
        """Create the sample patients through the patient repository."""
        patients = [
            {'patient_id': 'TCGA-A1-001', 'project_id': 'TCGA-BRCA', 'age': 55, 'gender': 'female', 'race': 'white', 'vital_status': 'alive'},
            {'patient_id': 'TCGA-A1-002', 'project_id': 'TCGA-BRCA', 'age': 62, 'gender': 'female', 'race': 'asian', 'vital_status': 'alive'},
            {'patient_id': 'TCGA-L1-001', 'project_id': 'TCGA-LUAD', 'age': 68, 'gender': 'male', 'race': 'white', 'vital_status': 'deceased'},
            {'patient_id': 'TCGA-L1-002', 'project_id': 'TCGA-LUAD', 'age': 71, 'gender': 'male', 'race': 'black', 'vital_status': 'alive'},
            {'patient_id': 'TCGA-C1-001', 'project_id': 'TCGA-COAD', 'age': 58, 'gender': 'female', 'race': 'white', 'vital_status': 'alive'},
        ]

        for patient_data in patients:
            self.patient_repo.create_patient(patient_data)
            logger.info(f"Created patient: {patient_data['patient_id']}")

    def _link_sample_diagnoses(self):
        """Link each sample patient to a cancer type, with stage and grade."""
        diagnoses = [
            {'patient_id': 'TCGA-A1-001', 'cancer_type_id': 'BRCA', 'properties': {'stage': 'Stage II', 'grade': 'G2'}},
            {'patient_id': 'TCGA-A1-002', 'cancer_type_id': 'BRCA', 'properties': {'stage': 'Stage III', 'grade': 'G3'}},
            {'patient_id': 'TCGA-L1-001', 'cancer_type_id': 'LUAD', 'properties': {'stage': 'Stage IV', 'grade': 'G3'}},
            {'patient_id': 'TCGA-L1-002', 'cancer_type_id': 'LUAD', 'properties': {'stage': 'Stage II', 'grade': 'G2'}},
            {'patient_id': 'TCGA-C1-001', 'cancer_type_id': 'COAD', 'properties': {'stage': 'Stage III', 'grade': 'G2'}},
        ]

        for diagnosis in diagnoses:
            self.patient_repo.link_patient_to_cancer_type(
                diagnosis['patient_id'],
                diagnosis['cancer_type_id'],
                diagnosis['properties'],
            )

    def _import_sample_mutations(self):
        """Create the sample mutations, each passed along with its gene id.

        Raises:
            KeyError: if a mutation has no entry in the mutation-to-gene map.
        """
        mutations = [
            {'mutation_id': 'MUT-TP53-001', 'chromosome': 'chr17', 'position': 7577538, 'reference': 'C', 'alternate': 'T', 'consequence': 'missense', 'variant_type': 'SNP', 'quality': 35.2},
            {'mutation_id': 'MUT-BRAF-001', 'chromosome': 'chr7', 'position': 140453136, 'reference': 'A', 'alternate': 'T', 'consequence': 'missense', 'variant_type': 'SNP', 'quality': 42.1},
            {'mutation_id': 'MUT-BRCA2-001', 'chromosome': 'chr13', 'position': 32914438, 'reference': 'T', 'alternate': 'C', 'consequence': 'missense', 'variant_type': 'SNP', 'quality': 38.7},
            {'mutation_id': 'MUT-PIK3CA-001', 'chromosome': 'chr3', 'position': 178936091, 'reference': 'G', 'alternate': 'A', 'consequence': 'missense', 'variant_type': 'SNP', 'quality': 33.5},
            {'mutation_id': 'MUT-KRAS-001', 'chromosome': 'chr12', 'position': 25398284, 'reference': 'C', 'alternate': 'T', 'consequence': 'missense', 'variant_type': 'SNP', 'quality': 39.4},
        ]

        # Keyed by mutation id rather than paired by list position: reordering
        # either collection can no longer attach a mutation to the wrong gene,
        # and a missing entry fails loudly instead of being silently dropped.
        gene_by_mutation = {
            'MUT-TP53-001': 'ENSG00000141510',
            'MUT-BRAF-001': 'ENSG00000157764',
            'MUT-BRCA2-001': 'ENSG00000139618',
            'MUT-PIK3CA-001': 'ENSG00000121879',
            'MUT-KRAS-001': 'ENSG00000133703',
        }

        for mutation_data in mutations:
            gene_id = gene_by_mutation[mutation_data['mutation_id']]
            self.mutation_repo.create_mutation(mutation_data, gene_id)
            logger.info(f"Created mutation: {mutation_data['mutation_id']}")

    def _link_sample_patient_mutations(self):
        """Link sample mutations to patients, with allele frequency and depth."""
        patient_mutations = [
            {'patient_id': 'TCGA-A1-001', 'mutation_id': 'MUT-TP53-001', 'properties': {'allele_frequency': 0.45, 'depth': 50}},
            {'patient_id': 'TCGA-A1-001', 'mutation_id': 'MUT-PIK3CA-001', 'properties': {'allele_frequency': 0.38, 'depth': 48}},
            {'patient_id': 'TCGA-A1-002', 'mutation_id': 'MUT-BRCA2-001', 'properties': {'allele_frequency': 0.52, 'depth': 55}},
            {'patient_id': 'TCGA-L1-001', 'mutation_id': 'MUT-KRAS-001', 'properties': {'allele_frequency': 0.49, 'depth': 58}},
            {'patient_id': 'TCGA-L1-001', 'mutation_id': 'MUT-TP53-001', 'properties': {'allele_frequency': 0.41, 'depth': 45}},
            {'patient_id': 'TCGA-L1-002', 'mutation_id': 'MUT-BRAF-001', 'properties': {'allele_frequency': 0.47, 'depth': 52}},
            {'patient_id': 'TCGA-C1-001', 'mutation_id': 'MUT-KRAS-001', 'properties': {'allele_frequency': 0.44, 'depth': 50}},
        ]

        for pm in patient_mutations:
            self.mutation_repo.link_mutation_to_patient(
                pm['mutation_id'],
                pm['patient_id'],
                pm['properties'],
            )

    def import_gdc_data(self, gdc_files: List[Dict]) -> None:
        """Import data from GDC portal.

        Not implemented yet: ``gdc_files`` is ignored and nothing is written.
        A warning is logged so the no-op is visible to the caller.
        """
        logger.warning("import_gdc_data is not implemented; no data was imported")
|
|
|
|
|
|
|
|
|
def initialize_database():
    """Initialize database with sample data.

    Builds a ``DataImporter``, runs its sample-data import, and closes the
    importer afterwards whether or not the import raised.
    """
    sample_importer = DataImporter()
    try:
        sample_importer.import_sample_data()
    finally:
        # Always release the importer's database handle, even on failure.
        sample_importer.close()
|
|
|
|
|
|
|
|
|
# Script entry point: load the sample data when this module is run directly.
if __name__ == "__main__":
    initialize_database()
|
|
|
|