File size: 1,641 Bytes
f986893
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
"""

Script that builds the atom and bond vocabularies from a dataset file and saves them to disk.

"""
import os

from grover.data.torchvocab import MolVocab


def build(argv=None):
    """
    Build the atom and bond vocabularies and save them to disk.

    For each vocabulary type ('atom', then 'bond') a ``MolVocab`` is built from
    ``--data_path`` and saved with ``MolVocab.save_vocab`` to
    ``<vocab_save_folder>/[<dataset_name>_]<vocab_type>_vocab.pkl``.

    :param argv: optional list of command line arguments. When None (the
                 default), argparse reads them from ``sys.argv[1:]``.
    :return: None.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', default="../../dataset/grover_new_dataset/druglike_merged_refine2.csv", type=str)
    parser.add_argument('--vocab_save_folder', default="../../dataset/grover_new_dataset", type=str)
    parser.add_argument('--dataset_name', type=str, default=None,
                        help="Will be the first part of the vocab file name. If it is None, "
                             "the vocab files will be: atom_vocab.pkl and bond_vocab.pkl")
    parser.add_argument('--vocab_max_size', type=int, default=None)
    parser.add_argument('--vocab_min_freq', type=int, default=1)
    parser.add_argument('--num_workers', type=int, default=100,
                        help="Number of workers passed to MolVocab.")
    args = parser.parse_args(argv)

    for vocab_type in ['atom', 'bond']:
        vocab_file = f"{vocab_type}_vocab.pkl"
        if args.dataset_name is not None:
            vocab_file = args.dataset_name + '_' + vocab_file
        vocab_save_path = os.path.join(args.vocab_save_folder, vocab_file)

        # The directory part is empty when saving into the current directory
        # (e.g. --vocab_save_folder ""); os.makedirs("") would raise, so skip it.
        save_dir = os.path.dirname(vocab_save_path)
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)

        vocab = MolVocab(file_path=args.data_path,
                         max_size=args.vocab_max_size,
                         min_freq=args.vocab_min_freq,
                         num_workers=args.num_workers,
                         vocab_type=vocab_type)
        print(f"{vocab_type} vocab size", len(vocab))
        vocab.save_vocab(vocab_save_path)


# Run the vocabulary build only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    build()