"""

The evaluation function.

"""
from argparse import Namespace
from logging import Logger
from typing import List

import numpy as np
import torch
import torch.utils.data.distributed

from grover.data.scaler import StandardScaler
from grover.util.utils import get_class_sizes, get_data, split_data, get_task_names, get_loss_func
from grover.util.utils import load_checkpoint
from grover.util.metrics import get_metric_func
from grover.util.nn_utils import param_count
from task.predict import evaluate_predictions, predict


def run_evaluation(args: Namespace, logger: Logger = None) -> List[float]:
    """

    Trains a model and returns test scores on the model checkpoint with the highest validation score.



    :param args: Arguments.

    :param logger: Logger.

    :return: A list of ensemble scores for each task.

    """
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    # Pin computation to the first GPU when CUDA is requested and available.
    if args.cuda and torch.cuda.is_available():
        torch.cuda.set_device(0)

    # Get data
    debug('Loading data')
    args.task_names = get_task_names(args.data_path)
    data = get_data(path=args.data_path, args=args, logger=logger)
    args.num_tasks = data.num_tasks()
    args.features_size = data.features_size()
    debug(f'Number of tasks = {args.num_tasks}')

    # Split data
    debug(f'Splitting data with seed {args.seed}')

    train_data, val_data, test_data = split_data(data=data,
                                                 split_type=args.split_type,
                                                 sizes=[0.8, 0.1, 0.1],
                                                 seed=args.seed,
                                                 args=args,
                                                 logger=logger)

    if args.dataset_type == 'classification':
        class_sizes = get_class_sizes(data)
        debug('Class sizes')
        for i, task_class_sizes in enumerate(class_sizes):
            debug(f'{args.task_names[i]} '
                  f'{", ".join(f"{cls}: {size * 100:.2f}%" for cls, size in enumerate(task_class_sizes))}')

    if args.features_scaling:
        features_scaler = train_data.normalize_features(replace_nan_token=0)
        val_data.normalize_features(features_scaler)
        test_data.normalize_features(features_scaler)
    else:
        features_scaler = None

    args.train_data_size = len(train_data)

    debug(f'Total size = {len(data):,} | '
          f'train size = {len(train_data):,} | val size = {len(val_data):,} | test size = {len(test_data):,}')

    # Initialize scaler (regression only)
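    # For regression, targets are standardized (zero mean, unit variance); the fitted
    # scaler is passed to predict() below, which is expected to map predictions back
    # to the original target scale before scoring.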
    scaler = None
    if args.dataset_type == 'regression':
        debug('Fitting scaler')
        _, train_targets = train_data.smiles(), train_data.targets()
        scaler = StandardScaler().fit(train_targets)
        scaled_targets = scaler.transform(train_targets).tolist()
        train_data.set_targets(scaled_targets)

        val_targets = val_data.targets()
        scaled_val_targets = scaler.transform(val_targets).tolist()
        val_data.set_targets(scaled_val_targets)

    metric_func = get_metric_func(metric=args.metric)

    # Set up test set evaluation
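    # sum_test_preds accumulates raw test predictions so that, if several checkpoints
    # were evaluated, their outputs could be averaged into an ensemble; here a single
    # checkpoint contributes to the sum.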
    test_smiles, test_targets = test_data.smiles(), test_data.targets()
    sum_test_preds = np.zeros((len(test_smiles), args.num_tasks))

    # Load the checkpoint whose fold index matches the current seed
    if args.checkpoint_paths is None:
        raise ValueError('run_evaluation requires args.checkpoint_paths to be set.')
    target_path = None
    for path in args.checkpoint_paths:
        if "fold_%d" % args.seed in path:
            target_path = path
    if target_path is None:
        raise FileNotFoundError(f'No checkpoint matching "fold_{args.seed}" found in args.checkpoint_paths.')
    debug(f'Loading model {args.seed} from {target_path}')
    model = load_checkpoint(target_path, current_args=args, cuda=args.cuda, logger=logger)

    # Get the loss function used by predict()
    loss_func = get_loss_func(args, model)

    debug(f'Number of parameters = {param_count(model):,}')

    test_preds, _ = predict(
        model=model,
        data=test_data,
        batch_size=args.batch_size,
        loss_func=loss_func,
        logger=logger,
        shared_dict={},
        scaler=scaler,
        args=args
    )

    test_scores = evaluate_predictions(
        preds=test_preds,
        targets=test_targets,
        num_tasks=args.num_tasks,
        metric_func=metric_func,
        dataset_type=args.dataset_type,
        logger=logger
    )

    if len(test_preds) != 0:
        sum_test_preds += np.array(test_preds, dtype=float)

    # Average test score
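    # np.nanmean ignores NaN entries, so tasks whose metric could not be computed
    # are excluded from the average rather than turning it into NaN.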
    avg_test_score = np.nanmean(test_scores)
    info(f'Model test {args.metric} = {avg_test_score:.6f}')

    if args.show_individual_scores:
        # Individual test scores
        for task_name, test_score in zip(args.task_names, test_scores):
            info(f'Model test {task_name} {args.metric} = {test_score:.6f}')

    # Evaluate ensemble on test set. Note that only one checkpoint is loaded above,
    # so this average only matches the summed predictions when args.ensemble_size is 1;
    # larger values would scale the predictions down.
    avg_test_preds = (sum_test_preds / args.ensemble_size).tolist()

    ensemble_scores = evaluate_predictions(
        preds=avg_test_preds,
        targets=test_targets,
        num_tasks=args.num_tasks,
        metric_func=metric_func,
        dataset_type=args.dataset_type,
        logger=logger
    )

    # If you want to save the prediction results, uncomment these lines
    # (requires `import os` and `import pandas as pd` at the top of this file).
    # ind = [['preds'] * args.num_tasks + ['targets'] * args.num_tasks, args.task_names * 2]
    # ind = pd.MultiIndex.from_tuples(list(zip(*ind)))
    # data = np.concatenate([np.array(avg_test_preds), np.array(test_targets)], 1)
    # test_result = pd.DataFrame(data, index=test_smiles, columns=ind)
    # test_result.to_csv(os.path.join(args.save_dir, 'test_result.csv'))

    return ensemble_scores
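

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the GROVER pipeline). The values
# below cover just the attributes read directly in this file; get_data(),
# split_data(), load_checkpoint() and predict() typically expect additional
# Namespace attributes that the project's own argument parser fills in, so in
# practice run_evaluation() is driven through that parser.
#
#     args = Namespace(
#         data_path='data/my_dataset.csv',             # hypothetical CSV of SMILES + targets
#         dataset_type='classification',
#         split_type='random',
#         seed=0,
#         features_scaling=False,
#         metric='auc',
#         batch_size=32,
#         cuda=torch.cuda.is_available(),
#         checkpoint_paths=['ckpts/fold_0/model.pt'],  # path must contain "fold_<seed>"
#         ensemble_size=1,
#         show_individual_scores=False,
#     )
#     ensemble_scores = run_evaluation(args)
# ---------------------------------------------------------------------------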