@dataclasses.dataclass
class ResumeResultsConfig:
    """
    Settings for continuing a job that was previously stopped or finished.

    This is mostly useful for training jobs that already completed (for
    example, to keep training up to a larger max_epochs than was first
    configured) or that were interrupted (for example, to continue on
    different hardware or with different data loader settings such as the
    number of data workers).

    WARNING: Backwards compatibility for training is typically not
    guaranteed, so resuming old experiments may not work well.

    Arguments:
        existing_dir: Directory holding the existing results to resume from.
        resume_wandb: If true, keep logging to the same WandB job, as
            identified by the run ID file (WANDB_RUN_ID_FILE) found in
            existing_dir, if any.
    """

    existing_dir: str
    resume_wandb: bool = False

    def prepare_directory(self, experiment_dir: str):
        """
        Recursively copy the contents of existing_dir into experiment_dir.

        Only the root process performs the copy. Unless resume_wandb is set,
        the copied WandB run ID file is deleted from experiment_dir.

        Arguments:
            experiment_dir: Destination directory for the copy. Typically an
                empty directory configured to hold a training job's outputs,
                such as model checkpoints.

        Raises:
            ValueError: If existing_dir is not an existing directory.
        """
        source_dir = self.existing_dir
        if not os.path.isdir(source_dir):
            raise ValueError(f"The directory {self.existing_dir} does not exist.")

        if not Distributed.get_instance().is_root():
            # non-root processes skip the copy and the cleanup below
            return

        # bring every file under existing_dir over to experiment_dir
        shutil.copytree(source_dir, experiment_dir, dirs_exist_ok=True)

        if self.resume_wandb:
            # keep the copied run ID file
            return
        copied_run_id_file = os.path.join(experiment_dir, WANDB_RUN_ID_FILE)
        if os.path.exists(copied_run_id_file):
            os.remove(copied_run_id_file)

    def verify_wandb_resumption(self, experiment_dir: str):
        """
        Check that WandB is logging to the run recorded in experiment_dir.

        Does nothing unless resume_wandb is set and WandB is enabled.

        Arguments:
            experiment_dir: Directory containing the WandB run ID file.

        Raises:
            ValueError: If the active WandB ID differs from the ID stored in
                the run ID file.
        """
        wandb = WandB.get_instance()
        if not (self.resume_wandb and wandb.enabled):
            return

        run_id_file = os.path.join(experiment_dir, WANDB_RUN_ID_FILE)
        with open(run_id_file) as id_stream:
            expected_id = id_stream.read().strip()

        if wandb.get_id() != expected_id:
            raise ValueError(
                f"Expected WandB job ID for resumption is {expected_id} "
                f"but the actual ID is {wandb.get_id()}. "
                "Is there a bug in ResumeResultsConfig?"
            )
def prepare_config(path: str, override: Sequence[str] | None = None) -> dict:
    """Load the YAML config at path and apply an optional dotlist override."""
    with open(path) as config_file:
        loaded = yaml.safe_load(config_file)
    return update_dict_with_dotlist(loaded, override)


def prepare_directory(
    path: str, config_data: dict, resume_results: ResumeResultsConfig | None = None
) -> ResumeResultsConfig | None:
    """
    Set up the experiment directory and write config_data into it.

    Arguments:
        path: Experiment directory. The root process creates it if missing.
        config_data: Configuration written to "config.yaml" inside path.
        resume_results: Optional resumption settings. They are applied only
            when path has no "training_checkpoints" subdirectory yet.

    Returns:
        resume_results if it was applied, otherwise None.
    """
    dist = Distributed.get_instance()
    if not os.path.isdir(path):
        if dist.is_root():
            os.makedirs(path, exist_ok=True)

    checkpoint_dir = os.path.join(path, "training_checkpoints")
    if resume_results is None or os.path.isdir(checkpoint_dir):
        # either not given or ignored because we already resumed once before
        resume_results = None
    else:
        resume_results.prepare_directory(path)

    config_path = os.path.join(path, "config.yaml")
    with open(config_path, "w") as config_file:
        yaml.dump(config_data, config_file, default_flow_style=False, sort_keys=False)
    return resume_results


def get_parser():
    """Build the standard argument parser for ACE entrypoints."""
    override_help = (
        "A dotlist of key=value pairs to override the config. "
        "For example, --override a.b=1 c=2, where a dot indicates nesting."
    )
    parser = argparse.ArgumentParser()
    parser.add_argument("yaml_config", type=str, help="Path to the YAML config file.")
    parser.add_argument("--override", nargs="*", help=override_help)
    return parser