"""Default configuration"""
from typing import Literal
from pydantic import BaseModel
[docs]
class Configuration(BaseModel, extra='allow'):
    """Base configuration class.

    Undeclared keyword arguments are accepted (``extra='allow'``).
    Besides attribute access, values can be read with dict-style
    subscripting, e.g. ``cfg['batch_size']``.
    """

    def __getitem__(self, idx):
        """Return the configuration value stored under the name ``idx``.

        Args:
            idx: Name of the configuration entry to read.

        Returns:
            The value of the attribute called ``idx``.

        Raises:
            AttributeError: If no attribute with that name exists.
        """
        # Use ``getattr`` instead of calling ``__getattribute__`` directly:
        # the direct call skips the ``__getattr__`` fallback, which pydantic
        # v2 uses to serve extra (undeclared) fields, so subscripting an
        # extra field would fail even though ``cfg.name`` works.
        return getattr(self, idx)
[docs]
class TrainingConfiguration(Configuration):
    """Default configuration object for training.

    Override defaults and/or add new configuration entries through the
    constructor.

    Example:

    >>> cfg = TrainingConfiguration(batch_size=17, param_a=42)
    >>> print(cfg.batch_size)  # prints 17 (overrides default)
    >>> print(cfg.param_a)  # prints 42 (new value)
    >>> print(cfg.pin_gpu_memory)  # prints the default value
    >>>
    >>> from rich import print
    >>> print(cfg)  # pretty-print of configuration
    """

    #: Batch size. In a distributed environment it is usually the
    #: per-worker batch size. Defaults to 32.
    batch_size: int = 32
    #: Whether to shuffle the train dataset when creating a torch
    #: ``DataLoader``. Defaults to False.
    shuffle_train: bool = False
    #: Whether to shuffle the validation dataset when creating a torch
    #: ``DataLoader``. Defaults to False.
    shuffle_validation: bool = False
    #: Whether to shuffle the test dataset when creating a torch
    #: ``DataLoader``. Defaults to False.
    shuffle_test: bool = False
    #: Whether to pin GPU memory. Property of torch ``DataLoader``.
    #: Defaults to False.
    pin_gpu_memory: bool = False
    #: Number of parallel workers used by torch ``DataLoader``.
    #: Defaults to 4.
    num_workers_dataloader: int = 4
    #: Loss function. Defaults to 'cross_entropy'.
    loss: Literal['mse', 'nllloss', 'cross_entropy'] = 'cross_entropy'
    #: Name of the optimizer to use. Defaults to 'adam'.
    optimizer: Literal['adadelta', 'adam', 'rmsprop', 'sgd'] = 'adam'
    #: Learning rate used by the optimizer. Defaults to 1e-3.
    optim_lr: float = 1e-3
    #: Momentum used by some optimizers (e.g., SGD). Defaults to 0.9.
    optim_momentum: float = 0.9
    #: Weight decay parameter for the optimizer. Defaults to 0.
    optim_weight_decay: float = 0.0
    #: Parameter of Horovod's ``DistributedOptimizer``: uses float16
    #: operations in the allreduce distributed gradients aggregation.
    #: Better performance at lower precision. Defaults to False.
    fp16_allreduce: bool = False
    #: Parameter of Horovod's ``DistributedOptimizer``: use Adasum
    #: optimization. Defaults to False.
    use_adasum: bool = False
    #: Parameter of Horovod's ``DistributedOptimizer``: scale
    #: gradients before adding them up. Defaults to 1.0.
    gradient_predivide_factor: float = 1.0