File size: 8,315 Bytes
7a92197
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
"""

Data Importer for Neo4j

Import cancer data from various sources into the graph database

"""

from pathlib import Path
from typing import Dict, List
import logging
from .db_manager import (
    DatabaseManager,
    GeneRepository,
    MutationRepository,
    PatientRepository,
    CancerTypeRepository
)

# Configure the root logger at INFO level as soon as this module is imported.
# NOTE(review): this is an import-time side effect on process-wide logging;
# confirm that code importing this module as a library expects it.
logging.basicConfig(level=logging.INFO)
# Module-level logger named after this module; used by DataImporter below.
logger = logging.getLogger(__name__)


class DataImporter:
    """Import cancer genomics data into Neo4j.

    The importer owns one ``DatabaseManager`` plus one repository per entity
    type (genes, mutations, patients, cancer types), all built on that
    manager. Call :meth:`close` when finished, or use the instance as a
    context manager so the connection is closed even if an import step
    raises::

        with DataImporter() as importer:
            importer.import_sample_data()
    """

    def __init__(self):
        """Create the database manager and the repositories that share it."""
        self.db = DatabaseManager()
        self.gene_repo = GeneRepository(self.db)
        self.mutation_repo = MutationRepository(self.db)
        self.patient_repo = PatientRepository(self.db)
        self.cancer_repo = CancerTypeRepository(self.db)

    def __enter__(self):
        """Return this importer so it can be bound by a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the database connection when the ``with`` block ends.

        Returns ``False`` so an exception raised inside the block is never
        suppressed.
        """
        self.close()
        return False

    def close(self):
        """Close database connection"""
        self.db.close()

    def import_sample_data(self):
        """Import sample cancer data for demonstration.

        The steps run in a fixed order so that every entity is created
        before a link that refers to its id: cancer types, genes, patients,
        patient -> cancer-type links, mutations (each created together with
        its gene id), and finally mutation -> patient links.
        """
        logger.info("Importing sample cancer data...")

        self._import_sample_cancer_types()
        self._import_sample_genes()
        self._import_sample_patients()
        self._link_sample_diagnoses()
        self._import_sample_mutations()
        self._link_sample_patient_mutations()

        logger.info("Sample data import completed!")

    def _import_sample_cancer_types(self):
        """Create the sample cancer-type records via the cancer repository."""
        cancer_types = [
            {'cancer_type_id': 'BRCA', 'name': 'Breast Cancer', 'tissue': 'Breast', 'disease_type': 'Adenocarcinoma'},
            {'cancer_type_id': 'LUAD', 'name': 'Lung Adenocarcinoma', 'tissue': 'Lung', 'disease_type': 'Adenocarcinoma'},
            {'cancer_type_id': 'COAD', 'name': 'Colon Adenocarcinoma', 'tissue': 'Colon', 'disease_type': 'Adenocarcinoma'},
            {'cancer_type_id': 'GBM', 'name': 'Glioblastoma', 'tissue': 'Brain', 'disease_type': 'Glioblastoma'},
        ]

        for cancer_data in cancer_types:
            self.cancer_repo.create_cancer_type(cancer_data)
            logger.info("Created cancer type: %s", cancer_data['name'])

    def _import_sample_genes(self):
        """Create the sample gene records via the gene repository."""
        genes = [
            {'gene_id': 'ENSG00000141510', 'symbol': 'TP53', 'name': 'Tumor protein p53', 'chromosome': 'chr17', 'gene_type': 'protein_coding'},
            {'gene_id': 'ENSG00000157764', 'symbol': 'BRAF', 'name': 'B-Raf proto-oncogene', 'chromosome': 'chr7', 'gene_type': 'protein_coding'},
            {'gene_id': 'ENSG00000139618', 'symbol': 'BRCA2', 'name': 'BRCA2 DNA repair associated', 'chromosome': 'chr13', 'gene_type': 'protein_coding'},
            {'gene_id': 'ENSG00000012048', 'symbol': 'BRCA1', 'name': 'BRCA1 DNA repair associated', 'chromosome': 'chr17', 'gene_type': 'protein_coding'},
            {'gene_id': 'ENSG00000121879', 'symbol': 'PIK3CA', 'name': 'Phosphatidylinositol-4,5-bisphosphate 3-kinase', 'chromosome': 'chr3', 'gene_type': 'protein_coding'},
            {'gene_id': 'ENSG00000133703', 'symbol': 'KRAS', 'name': 'KRAS proto-oncogene', 'chromosome': 'chr12', 'gene_type': 'protein_coding'},
            {'gene_id': 'ENSG00000146648', 'symbol': 'EGFR', 'name': 'Epidermal growth factor receptor', 'chromosome': 'chr7', 'gene_type': 'protein_coding'},
        ]

        for gene_data in genes:
            self.gene_repo.create_gene(gene_data)
            logger.info("Created gene: %s", gene_data['symbol'])

    def _import_sample_patients(self):
        """Create the sample patient records via the patient repository."""
        patients = [
            {'patient_id': 'TCGA-A1-001', 'project_id': 'TCGA-BRCA', 'age': 55, 'gender': 'female', 'race': 'white', 'vital_status': 'alive'},
            {'patient_id': 'TCGA-A1-002', 'project_id': 'TCGA-BRCA', 'age': 62, 'gender': 'female', 'race': 'asian', 'vital_status': 'alive'},
            {'patient_id': 'TCGA-L1-001', 'project_id': 'TCGA-LUAD', 'age': 68, 'gender': 'male', 'race': 'white', 'vital_status': 'deceased'},
            {'patient_id': 'TCGA-L1-002', 'project_id': 'TCGA-LUAD', 'age': 71, 'gender': 'male', 'race': 'black', 'vital_status': 'alive'},
            {'patient_id': 'TCGA-C1-001', 'project_id': 'TCGA-COAD', 'age': 58, 'gender': 'female', 'race': 'white', 'vital_status': 'alive'},
        ]

        for patient_data in patients:
            self.patient_repo.create_patient(patient_data)
            logger.info("Created patient: %s", patient_data['patient_id'])

    def _link_sample_diagnoses(self):
        """Link each sample patient to a cancer type with stage/grade properties."""
        diagnoses = [
            {'patient_id': 'TCGA-A1-001', 'cancer_type_id': 'BRCA', 'properties': {'stage': 'Stage II', 'grade': 'G2'}},
            {'patient_id': 'TCGA-A1-002', 'cancer_type_id': 'BRCA', 'properties': {'stage': 'Stage III', 'grade': 'G3'}},
            {'patient_id': 'TCGA-L1-001', 'cancer_type_id': 'LUAD', 'properties': {'stage': 'Stage IV', 'grade': 'G3'}},
            {'patient_id': 'TCGA-L1-002', 'cancer_type_id': 'LUAD', 'properties': {'stage': 'Stage II', 'grade': 'G2'}},
            {'patient_id': 'TCGA-C1-001', 'cancer_type_id': 'COAD', 'properties': {'stage': 'Stage III', 'grade': 'G2'}},
        ]

        for diagnosis in diagnoses:
            self.patient_repo.link_patient_to_cancer_type(
                diagnosis['patient_id'],
                diagnosis['cancer_type_id'],
                diagnosis['properties'],
            )

    def _import_sample_mutations(self):
        """Create the sample mutations, each together with its gene id."""
        mutations = [
            {'mutation_id': 'MUT-TP53-001', 'chromosome': 'chr17', 'position': 7577538, 'reference': 'C', 'alternate': 'T', 'consequence': 'missense', 'variant_type': 'SNP', 'quality': 35.2},
            {'mutation_id': 'MUT-BRAF-001', 'chromosome': 'chr7', 'position': 140453136, 'reference': 'A', 'alternate': 'T', 'consequence': 'missense', 'variant_type': 'SNP', 'quality': 42.1},
            {'mutation_id': 'MUT-BRCA2-001', 'chromosome': 'chr13', 'position': 32914438, 'reference': 'T', 'alternate': 'C', 'consequence': 'missense', 'variant_type': 'SNP', 'quality': 38.7},
            {'mutation_id': 'MUT-PIK3CA-001', 'chromosome': 'chr3', 'position': 178936091, 'reference': 'G', 'alternate': 'A', 'consequence': 'missense', 'variant_type': 'SNP', 'quality': 33.5},
            {'mutation_id': 'MUT-KRAS-001', 'chromosome': 'chr12', 'position': 25398284, 'reference': 'C', 'alternate': 'T', 'consequence': 'missense', 'variant_type': 'SNP', 'quality': 39.4},
        ]

        # Keyed by mutation_id so each mutation is matched to its gene by id
        # rather than by list position: reordering or extending either
        # collection can no longer silently attach a mutation to the wrong
        # gene or drop it, and a missing entry fails loudly with KeyError.
        gene_id_by_mutation = {
            'MUT-TP53-001': 'ENSG00000141510',
            'MUT-BRAF-001': 'ENSG00000157764',
            'MUT-BRCA2-001': 'ENSG00000139618',
            'MUT-PIK3CA-001': 'ENSG00000121879',
            'MUT-KRAS-001': 'ENSG00000133703',
        }

        for mutation_data in mutations:
            gene_id = gene_id_by_mutation[mutation_data['mutation_id']]
            self.mutation_repo.create_mutation(mutation_data, gene_id)
            logger.info("Created mutation: %s", mutation_data['mutation_id'])

    def _link_sample_patient_mutations(self):
        """Link sample mutations to the patients that carry them."""
        patient_mutations = [
            {'patient_id': 'TCGA-A1-001', 'mutation_id': 'MUT-TP53-001', 'properties': {'allele_frequency': 0.45, 'depth': 50}},
            {'patient_id': 'TCGA-A1-001', 'mutation_id': 'MUT-PIK3CA-001', 'properties': {'allele_frequency': 0.38, 'depth': 48}},
            {'patient_id': 'TCGA-A1-002', 'mutation_id': 'MUT-BRCA2-001', 'properties': {'allele_frequency': 0.52, 'depth': 55}},
            {'patient_id': 'TCGA-L1-001', 'mutation_id': 'MUT-KRAS-001', 'properties': {'allele_frequency': 0.49, 'depth': 58}},
            {'patient_id': 'TCGA-L1-001', 'mutation_id': 'MUT-TP53-001', 'properties': {'allele_frequency': 0.41, 'depth': 45}},
            {'patient_id': 'TCGA-L1-002', 'mutation_id': 'MUT-BRAF-001', 'properties': {'allele_frequency': 0.47, 'depth': 52}},
            {'patient_id': 'TCGA-C1-001', 'mutation_id': 'MUT-KRAS-001', 'properties': {'allele_frequency': 0.44, 'depth': 50}},
        ]

        for pm in patient_mutations:
            # The repository call takes the mutation id first, then the patient id.
            self.mutation_repo.link_mutation_to_patient(
                pm['mutation_id'],
                pm['patient_id'],
                pm['properties'],
            )

    def import_gdc_data(self, gdc_files: List[Dict]):
        """Import data from GDC portal.

        NOTE(review): unimplemented placeholder. It currently ignores
        ``gdc_files``, imports nothing, and returns ``None``.
        """
        # Implementation for importing real GDC data
        pass


def initialize_database():
    """Populate the database with the bundled sample data set.

    Builds a ``DataImporter``, runs its sample import, and closes the
    importer afterwards whether or not the import raised.
    """
    sample_importer = DataImporter()
    try:
        sample_importer.import_sample_data()
    finally:
        # Release the database connection even if the import fails part-way.
        sample_importer.close()


# Script entry point: load the sample data when this module is executed directly.
# NOTE(review): the relative import of `.db_manager` above needs package
# context, so this presumably has to be launched as
# `python -m <package>.<module>` rather than by file path — confirm.
if __name__ == "__main__":
    initialize_database()