# File size: 8,315 bytes (snapshot 7a92197)
"""
Data Importer for Neo4j
Import cancer data from various sources into the graph database
"""
from pathlib import Path
from typing import Dict, List
import logging
from .db_manager import (
DatabaseManager,
GeneRepository,
MutationRepository,
PatientRepository,
CancerTypeRepository
)
# Configure root logging at INFO level as a side effect of importing this module.
# NOTE(review): this affects every application that imports the module —
# confirm that configuring logging here (rather than in the entry point) is intended.
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module, used by everything below.
logger = logging.getLogger(__name__)
class DataImporter:
    """Import cancer genomics data into Neo4j.

    Owns one ``DatabaseManager`` plus one repository per entity kind (genes,
    mutations, patients, cancer types), each built on that manager.  Call
    :meth:`close` when finished, or use the importer as a context manager::

        with DataImporter() as importer:
            importer.import_sample_data()
    """

    def __init__(self):
        """Create the database manager and the repositories that share it."""
        self.db = DatabaseManager()
        self.gene_repo = GeneRepository(self.db)
        self.mutation_repo = MutationRepository(self.db)
        self.patient_repo = PatientRepository(self.db)
        self.cancer_repo = CancerTypeRepository(self.db)

    def __enter__(self):
        """Return the importer itself for use in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the database connection; never suppresses an exception."""
        self.close()
        return False

    def close(self):
        """Close database connection"""
        self.db.close()

    def import_sample_data(self):
        """Import sample cancer data for demonstration.

        Writes a small hard-coded data set through the repositories.  Every
        entity is created before any relationship that refers to it: cancer
        types, genes and patients first, then patient diagnoses, then
        mutations (each attached to its gene), then patient-mutation links.
        """
        logger.info("Importing sample cancer data...")
        self._import_sample_cancer_types()
        self._import_sample_genes()
        self._import_sample_patients()
        self._link_sample_diagnoses()
        self._import_sample_mutations()
        self._link_sample_patient_mutations()
        logger.info("Sample data import completed!")

    def _import_sample_cancer_types(self):
        """Create the four sample cancer types."""
        cancer_types = [
            {'cancer_type_id': 'BRCA', 'name': 'Breast Cancer', 'tissue': 'Breast', 'disease_type': 'Adenocarcinoma'},
            {'cancer_type_id': 'LUAD', 'name': 'Lung Adenocarcinoma', 'tissue': 'Lung', 'disease_type': 'Adenocarcinoma'},
            {'cancer_type_id': 'COAD', 'name': 'Colon Adenocarcinoma', 'tissue': 'Colon', 'disease_type': 'Adenocarcinoma'},
            {'cancer_type_id': 'GBM', 'name': 'Glioblastoma', 'tissue': 'Brain', 'disease_type': 'Glioblastoma'},
        ]
        for cancer_data in cancer_types:
            self.cancer_repo.create_cancer_type(cancer_data)
            logger.info("Created cancer type: %s", cancer_data['name'])

    def _import_sample_genes(self):
        """Create the seven sample genes."""
        genes = [
            {'gene_id': 'ENSG00000141510', 'symbol': 'TP53', 'name': 'Tumor protein p53', 'chromosome': 'chr17', 'gene_type': 'protein_coding'},
            {'gene_id': 'ENSG00000157764', 'symbol': 'BRAF', 'name': 'B-Raf proto-oncogene', 'chromosome': 'chr7', 'gene_type': 'protein_coding'},
            {'gene_id': 'ENSG00000139618', 'symbol': 'BRCA2', 'name': 'BRCA2 DNA repair associated', 'chromosome': 'chr13', 'gene_type': 'protein_coding'},
            {'gene_id': 'ENSG00000012048', 'symbol': 'BRCA1', 'name': 'BRCA1 DNA repair associated', 'chromosome': 'chr17', 'gene_type': 'protein_coding'},
            {'gene_id': 'ENSG00000121879', 'symbol': 'PIK3CA', 'name': 'Phosphatidylinositol-4,5-bisphosphate 3-kinase', 'chromosome': 'chr3', 'gene_type': 'protein_coding'},
            {'gene_id': 'ENSG00000133703', 'symbol': 'KRAS', 'name': 'KRAS proto-oncogene', 'chromosome': 'chr12', 'gene_type': 'protein_coding'},
            {'gene_id': 'ENSG00000146648', 'symbol': 'EGFR', 'name': 'Epidermal growth factor receptor', 'chromosome': 'chr7', 'gene_type': 'protein_coding'},
        ]
        for gene_data in genes:
            self.gene_repo.create_gene(gene_data)
            logger.info("Created gene: %s", gene_data['symbol'])

    def _import_sample_patients(self):
        """Create the five sample patients."""
        patients = [
            {'patient_id': 'TCGA-A1-001', 'project_id': 'TCGA-BRCA', 'age': 55, 'gender': 'female', 'race': 'white', 'vital_status': 'alive'},
            {'patient_id': 'TCGA-A1-002', 'project_id': 'TCGA-BRCA', 'age': 62, 'gender': 'female', 'race': 'asian', 'vital_status': 'alive'},
            {'patient_id': 'TCGA-L1-001', 'project_id': 'TCGA-LUAD', 'age': 68, 'gender': 'male', 'race': 'white', 'vital_status': 'deceased'},
            {'patient_id': 'TCGA-L1-002', 'project_id': 'TCGA-LUAD', 'age': 71, 'gender': 'male', 'race': 'black', 'vital_status': 'alive'},
            {'patient_id': 'TCGA-C1-001', 'project_id': 'TCGA-COAD', 'age': 58, 'gender': 'female', 'race': 'white', 'vital_status': 'alive'},
        ]
        for patient_data in patients:
            self.patient_repo.create_patient(patient_data)
            logger.info("Created patient: %s", patient_data['patient_id'])

    def _link_sample_diagnoses(self):
        """Link each sample patient to the cancer type they were diagnosed with."""
        diagnoses = [
            {'patient_id': 'TCGA-A1-001', 'cancer_type_id': 'BRCA', 'properties': {'stage': 'Stage II', 'grade': 'G2'}},
            {'patient_id': 'TCGA-A1-002', 'cancer_type_id': 'BRCA', 'properties': {'stage': 'Stage III', 'grade': 'G3'}},
            {'patient_id': 'TCGA-L1-001', 'cancer_type_id': 'LUAD', 'properties': {'stage': 'Stage IV', 'grade': 'G3'}},
            {'patient_id': 'TCGA-L1-002', 'cancer_type_id': 'LUAD', 'properties': {'stage': 'Stage II', 'grade': 'G2'}},
            {'patient_id': 'TCGA-C1-001', 'cancer_type_id': 'COAD', 'properties': {'stage': 'Stage III', 'grade': 'G2'}},
        ]
        for diagnosis in diagnoses:
            self.patient_repo.link_patient_to_cancer_type(
                diagnosis['patient_id'],
                diagnosis['cancer_type_id'],
                diagnosis['properties'],
            )

    def _import_sample_mutations(self):
        """Create the five sample mutations, each attached to its gene."""
        mutations = [
            {'mutation_id': 'MUT-TP53-001', 'chromosome': 'chr17', 'position': 7577538, 'reference': 'C', 'alternate': 'T', 'consequence': 'missense', 'variant_type': 'SNP', 'quality': 35.2},
            {'mutation_id': 'MUT-BRAF-001', 'chromosome': 'chr7', 'position': 140453136, 'reference': 'A', 'alternate': 'T', 'consequence': 'missense', 'variant_type': 'SNP', 'quality': 42.1},
            {'mutation_id': 'MUT-BRCA2-001', 'chromosome': 'chr13', 'position': 32914438, 'reference': 'T', 'alternate': 'C', 'consequence': 'missense', 'variant_type': 'SNP', 'quality': 38.7},
            {'mutation_id': 'MUT-PIK3CA-001', 'chromosome': 'chr3', 'position': 178936091, 'reference': 'G', 'alternate': 'A', 'consequence': 'missense', 'variant_type': 'SNP', 'quality': 33.5},
            {'mutation_id': 'MUT-KRAS-001', 'chromosome': 'chr12', 'position': 25398284, 'reference': 'C', 'alternate': 'T', 'consequence': 'missense', 'variant_type': 'SNP', 'quality': 39.4},
        ]
        # Keyed by mutation id rather than list position: a missing or
        # misspelled id raises KeyError instead of silently pairing a mutation
        # with the wrong gene or dropping it.
        gene_by_mutation = {
            'MUT-TP53-001': 'ENSG00000141510',
            'MUT-BRAF-001': 'ENSG00000157764',
            'MUT-BRCA2-001': 'ENSG00000139618',
            'MUT-PIK3CA-001': 'ENSG00000121879',
            'MUT-KRAS-001': 'ENSG00000133703',
        }
        for mutation_data in mutations:
            gene_id = gene_by_mutation[mutation_data['mutation_id']]
            self.mutation_repo.create_mutation(mutation_data, gene_id)
            logger.info("Created mutation: %s", mutation_data['mutation_id'])

    def _link_sample_patient_mutations(self):
        """Link sample mutations to the patients they were observed in."""
        patient_mutations = [
            {'patient_id': 'TCGA-A1-001', 'mutation_id': 'MUT-TP53-001', 'properties': {'allele_frequency': 0.45, 'depth': 50}},
            {'patient_id': 'TCGA-A1-001', 'mutation_id': 'MUT-PIK3CA-001', 'properties': {'allele_frequency': 0.38, 'depth': 48}},
            {'patient_id': 'TCGA-A1-002', 'mutation_id': 'MUT-BRCA2-001', 'properties': {'allele_frequency': 0.52, 'depth': 55}},
            {'patient_id': 'TCGA-L1-001', 'mutation_id': 'MUT-KRAS-001', 'properties': {'allele_frequency': 0.49, 'depth': 58}},
            {'patient_id': 'TCGA-L1-001', 'mutation_id': 'MUT-TP53-001', 'properties': {'allele_frequency': 0.41, 'depth': 45}},
            {'patient_id': 'TCGA-L1-002', 'mutation_id': 'MUT-BRAF-001', 'properties': {'allele_frequency': 0.47, 'depth': 52}},
            {'patient_id': 'TCGA-C1-001', 'mutation_id': 'MUT-KRAS-001', 'properties': {'allele_frequency': 0.44, 'depth': 50}},
        ]
        for link in patient_mutations:
            # The repository takes the mutation id first, then the patient id.
            self.mutation_repo.link_mutation_to_patient(
                link['mutation_id'],
                link['patient_id'],
                link['properties'],
            )

    def import_gdc_data(self, gdc_files: List[Dict]):
        """Import data from GDC portal.

        Not implemented yet: logs a warning and returns ``None`` without
        touching the database, so a caller cannot mistake the stub for a
        successful import.

        Args:
            gdc_files: GDC file records to import (currently ignored).
        """
        logger.warning("import_gdc_data is not implemented yet; no data was imported")
def initialize_database():
    """Populate the database with the built-in sample data set.

    Builds a ``DataImporter``, runs its sample import, and closes the
    importer afterwards whether or not the import raised.
    """
    sample_importer = DataImporter()
    try:
        sample_importer.import_sample_data()
    finally:
        # Always release the database connection, even on failure.
        sample_importer.close()
# Script entry point: running this module directly loads the sample data set.
# NOTE(review): the module uses a relative import (``from .db_manager import ...``),
# so it presumably has to be launched as part of its package
# (``python -m <package>.<module>``) — confirm the intended invocation.
if __name__ == "__main__":
    initialize_database()
|