Describe the bug
Hi @espnet team, thanks for the amazing work. I am running the LibriSpeech recipe in distributed mode using Slurm on ESPnet2. I am running on two Oracle instances, each with a single GPU (Tesla V100). When I ran stage 11, it created jobs on both machines and GPU memory was utilized on both, but training failed after some time.
Basic environments:
- OS information: Ubuntu 18.04 x86_64
- python version: python 3.9 [GCC 7.3.0]
- espnet version: latest
- pytorch version 1.12.0
- cuda 10.2
Task information:
- Task: ASR
- Recipe: librispeech
- ESPnet2
To Reproduce
When I ran stage 11 with Slurm, it showed the error below after some time.
slurm.conf
#Default configuration
command sbatch --export=PATH
option name=* --job-name $0
option time=* --time $0
option mem=* --mem-per-cpu $0
option mem=0
option num_threads=* --cpus-per-task $0 --ntasks-per-node=1
option num_threads=1 --cpus-per-task 12 --ntasks-per-node=1
option num_nodes=* --nodes $0
option gpu=1 -p tgpu
option gpu=* -p tgpu --gres=gpu:$0 -c $0 # Recommended: allocate at least as many CPUs as GPUs
#note: the --max-jobs-run option is supported as a special case
#by slurm.pl and you don't have to handle it in the config file.
#default cpu=1
$ sinfo
PARTITION AVAIL TIMELIMIT NODES STATE NODELIST
tgpu* up infinite 2 idle hp-[1-2]
$ scontrol show nodes
NodeName=hp-1 Arch=x86_64 CoresPerSocket=1
CPUAlloc=0 CPUErr=0 CPUTot=12 CPULoad=0.34
AvailableFeatures=(null)
ActiveFeatures=(null)
Gres=gpu:1
NodeAddr=hp-1 NodeHostName=hp-1 Version=17.11
OS=Linux 5.4.0-1079-oracle #87~18.04.1-Ubuntu SMP Mon Jul 11 03:41:03 UTC 2022
RealMemory=1 AllocMem=0 FreeMem=86991 Sockets=12 Boards=1
State=IDLE ThreadsPerCore=1 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
Partitions=tgpu
BootTime=2022-07-24T06:57:55 SlurmdStartTime=2022-07-24T10:10:49
CfgTRES=cpu=12,mem=1M,billing=12
AllocTRES=
CapWatts=n/a
CurrentWatts=0 LowestJoules=0 ConsumedJoules=0
ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
NodeName=hp-2 Arch=x86_64 CoresPerSocket=1
CPUAlloc=0 CPUErr=0 CPUTot=12 CPULoad=0.09
AvailableFeatures=(null)
ActiveFeatures=(null)
Gres=gpu:1
NodeAddr=hp-2 NodeHostName=hp-2 Version=17.11
OS=Linux 5.4.0-1079-oracle #87~18.04.1-Ubuntu SMP Mon Jul 11 03:41:03 UTC 2022
RealMemory=1 AllocMem=0 FreeMem=86953 Sockets=12 Boards=1
State=IDLE ThreadsPerCore=1 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
Partitions=tgpu
BootTime=2022-07-24T07:00:18 SlurmdStartTime=2022-07-24T10:15:26
CfgTRES=cpu=12,mem=1M,billing=12
AllocTRES=
CapWatts=n/a
CurrentWatts=0 LowestJoules=0 ConsumedJoules=0
ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
GPU utilization

Error logs
#Running on hp-1
#Started at Sat Jul 23 17:17:24 UTC 2022
#SLURMD_NODENAME=hp-1
#SLURM_CHECKPOINT_IMAGE_DIR=/var/slurm/checkpoint
#SLURM_CLUSTER_NAME=cluster
#SLURM_CPUS_ON_NODE=12
#SLURM_CPUS_PER_TASK=12
#SLURM_EXPORT_ENV=PATH
#SLURM_GET_USER_ENV=1
#SLURM_GTIDS=0
#SLURM_JOBID=70
#SLURM_JOB_CPUS_PER_NODE='12(x2)'
#SLURM_JOB_GID=1001
#SLURM_JOB_ID=70
#SLURM_JOB_NAME=test
#SLURM_JOB_NODELIST='hp-[1-2]'
#SLURM_JOB_NUM_NODES=2
#SLURM_JOB_PARTITION=tgpu
#SLURM_JOB_UID=1001
#SLURM_JOB_USER=ubuntu
#SLURM_LOCALID=0
#SLURM_NNODES=2
#SLURM_NODEID=0
#SLURM_NODELIST='hp-[1-2]'
#SLURM_NODE_ALIASES='(null)'
#SLURM_NPROCS=2
#SLURM_NTASKS=2
#SLURM_NTASKS_PER_NODE=1
#SLURM_OPEN_MODE=a
#SLURM_PRIO_PROCESS=0
#SLURM_PROCID=0
#SLURM_SUBMIT_DIR=/home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1
#SLURM_SUBMIT_HOST=hp-1
#SLURM_TASKS_PER_NODE='1(x2)'
#SLURM_TASK_PID=28524
#SLURM_TOPOLOGY_ADDR=hp-1
#SLURM_TOPOLOGY_ADDR_PATTERN=node
#SLURM_WORKING_CLUSTER=cluster:155.248.167.102:6817:8192
#srun --export=ALL srun -N2 python3 -m espnet2.bin.asr_train --ngpu 1 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/.dist_init_+SJraOwsjSi9F2aB --use_preprocessor true --bpemodel /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/data/en_token_list/bpe_unigram5000/bpe.model --token_type bpe --token_list /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/data/en_token_list/bpe_unigram5000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/dump/raw/dev/wav.scp,speech,sound --valid_data_path_and_name_and_type /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/dump/raw/dev/text,text,text --valid_shape_file /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/exp/asr_stats_raw_en_bpe5000//valid/speech_shape --valid_shape_file /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/exp/asr_stats_raw_en_bpe5000//valid/text_shape.bpe --resume false --init_param --ignore_init_mismatch false --fold_length 80000 --fold_length 150 --output_dir exp/asr_conformer_lr2e-3_8k_nospec_warmup25k_amp_nondeterministic_slurm --config /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/conf/cf2.yaml --frontend_conf fs=8k --normalize=global_mvn --normalize_conf stats_file=/home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/exp/asr_stats_raw_en_bpe5000//train/feats_stats.npz --train_data_path_and_name_and_type /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/dump/raw/train_100/wav.scp,speech,sound --train_data_path_and_name_and_type /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/dump/raw/train_100/text,text,text --train_shape_file /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/exp/asr_stats_raw_en_bpe5000//train/speech_shape --train_shape_file 
/home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/exp/asr_stats_raw_en_bpe5000//train/text_shape.bpe --ngpu 1 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/exp/asr_conformer_lr2e-3_8k_nospec_warmup25k_amp_nondeterministic_slurm/.dist_init_a71d8596-0515-49d8-8cff-e85faece2c90
/home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3 /home/ubuntu/users/himanshu/espnet/espnet2/bin/asr_train.py --ngpu 1 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/.dist_init_+SJraOwsjSi9F2aB --use_preprocessor true --bpemodel /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/data/en_token_list/bpe_unigram5000/bpe.model --token_type bpe --token_list /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/data/en_token_list/bpe_unigram5000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/dump/raw/dev/wav.scp,speech,sound --valid_data_path_and_name_and_type /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/dump/raw/dev/text,text,text --valid_shape_file /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/exp/asr_stats_raw_en_bpe5000//valid/speech_shape --valid_shape_file /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/exp/asr_stats_raw_en_bpe5000//valid/text_shape.bpe --resume false --init_param --ignore_init_mismatch false --fold_length 80000 --fold_length 150 --output_dir exp/asr_conformer_lr2e-3_8k_nospec_warmup25k_amp_nondeterministic_slurm --config /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/conf/cf2.yaml --frontend_conf fs=8k --normalize=global_mvn --normalize_conf stats_file=/home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/exp/asr_stats_raw_en_bpe5000//train/feats_stats.npz --train_data_path_and_name_and_type /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/dump/raw/train_100/wav.scp,speech,sound --train_data_path_and_name_and_type /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/dump/raw/train_100/text,text,text --train_shape_file /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/exp/asr_stats_raw_en_bpe5000//train/speech_shape --train_shape_file 
/home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/exp/asr_stats_raw_en_bpe5000//train/text_shape.bpe --ngpu 1 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/exp/asr_conformer_lr2e-3_8k_nospec_warmup25k_amp_nondeterministic_slurm/.dist_init_a71d8596-0515-49d8-8cff-e85faece2c90
/home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3 /home/ubuntu/users/himanshu/espnet/espnet2/bin/asr_train.py --ngpu 1 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/.dist_init_+SJraOwsjSi9F2aB --use_preprocessor true --bpemodel /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/data/en_token_list/bpe_unigram5000/bpe.model --token_type bpe --token_list /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/data/en_token_list/bpe_unigram5000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/dump/raw/dev/wav.scp,speech,sound --valid_data_path_and_name_and_type /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/dump/raw/dev/text,text,text --valid_shape_file /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/exp/asr_stats_raw_en_bpe5000//valid/speech_shape --valid_shape_file /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/exp/asr_stats_raw_en_bpe5000//valid/text_shape.bpe --resume false --init_param --ignore_init_mismatch false --fold_length 80000 --fold_length 150 --output_dir exp/asr_conformer_lr2e-3_8k_nospec_warmup25k_amp_nondeterministic_slurm --config /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/conf/cf2.yaml --frontend_conf fs=8k --normalize=global_mvn --normalize_conf stats_file=/home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/exp/asr_stats_raw_en_bpe5000//train/feats_stats.npz --train_data_path_and_name_and_type /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/dump/raw/train_100/wav.scp,speech,sound --train_data_path_and_name_and_type /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/dump/raw/train_100/text,text,text --train_shape_file /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/exp/asr_stats_raw_en_bpe5000//train/speech_shape --train_shape_file 
/home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/exp/asr_stats_raw_en_bpe5000//train/text_shape.bpe --ngpu 1 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/exp/asr_conformer_lr2e-3_8k_nospec_warmup25k_amp_nondeterministic_slurm/.dist_init_a71d8596-0515-49d8-8cff-e85faece2c90
/home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3 /home/ubuntu/users/himanshu/espnet/espnet2/bin/asr_train.py --ngpu 1 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/.dist_init_+SJraOwsjSi9F2aB --use_preprocessor true --bpemodel /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/data/en_token_list/bpe_unigram5000/bpe.model --token_type bpe --token_list /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/data/en_token_list/bpe_unigram5000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/dump/raw/dev/wav.scp,speech,sound --valid_data_path_and_name_and_type /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/dump/raw/dev/text,text,text --valid_shape_file /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/exp/asr_stats_raw_en_bpe5000//valid/speech_shape --valid_shape_file /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/exp/asr_stats_raw_en_bpe5000//valid/text_shape.bpe --resume false --init_param --ignore_init_mismatch false --fold_length 80000 --fold_length 150 --output_dir exp/asr_conformer_lr2e-3_8k_nospec_warmup25k_amp_nondeterministic_slurm --config /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/conf/cf2.yaml --frontend_conf fs=8k --normalize=global_mvn --normalize_conf stats_file=/home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/exp/asr_stats_raw_en_bpe5000//train/feats_stats.npz --train_data_path_and_name_and_type /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/dump/raw/train_100/wav.scp,speech,sound --train_data_path_and_name_and_type /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/dump/raw/train_100/text,text,text --train_shape_file /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/exp/asr_stats_raw_en_bpe5000//train/speech_shape --train_shape_file 
/home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/exp/asr_stats_raw_en_bpe5000//train/text_shape.bpe --ngpu 1 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/exp/asr_conformer_lr2e-3_8k_nospec_warmup25k_amp_nondeterministic_slurm/.dist_init_a71d8596-0515-49d8-8cff-e85faece2c90
/home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3 /home/ubuntu/users/himanshu/espnet/espnet2/bin/asr_train.py --ngpu 1 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/.dist_init_+SJraOwsjSi9F2aB --use_preprocessor true --bpemodel /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/data/en_token_list/bpe_unigram5000/bpe.model --token_type bpe --token_list /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/data/en_token_list/bpe_unigram5000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/dump/raw/dev/wav.scp,speech,sound --valid_data_path_and_name_and_type /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/dump/raw/dev/text,text,text --valid_shape_file /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/exp/asr_stats_raw_en_bpe5000//valid/speech_shape --valid_shape_file /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/exp/asr_stats_raw_en_bpe5000//valid/text_shape.bpe --resume false --init_param --ignore_init_mismatch false --fold_length 80000 --fold_length 150 --output_dir exp/asr_conformer_lr2e-3_8k_nospec_warmup25k_amp_nondeterministic_slurm --config /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/conf/cf2.yaml --frontend_conf fs=8k --normalize=global_mvn --normalize_conf stats_file=/home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/exp/asr_stats_raw_en_bpe5000//train/feats_stats.npz --train_data_path_and_name_and_type /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/dump/raw/train_100/wav.scp,speech,sound --train_data_path_and_name_and_type /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/dump/raw/train_100/text,text,text --train_shape_file /home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/exp/asr_stats_raw_en_bpe5000//train/speech_shape --train_shape_file 
/home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/exp/asr_stats_raw_en_bpe5000//train/text_shape.bpe --ngpu 1 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///home/ubuntu/users/himanshu/espnet/egs2/librispeech/asr1/exp/asr_conformer_lr2e-3_8k_nospec_warmup25k_amp_nondeterministic_slurm/.dist_init_a71d8596-0515-49d8-8cff-e85faece2c90
WARNING:root:Using legacy_rel_pos and it will be deprecated in the future.
WARNING:root:Using legacy_rel_pos and it will be deprecated in the future.
WARNING:root:Using legacy_rel_pos and it will be deprecated in the future.
WARNING:root:Using legacy_rel_pos and it will be deprecated in the future.
WARNING:root:Using legacy_rel_selfattn and it will be deprecated in the future.
WARNING:root:Using legacy_rel_selfattn and it will be deprecated in the future.
WARNING:root:Using legacy_rel_selfattn and it will be deprecated in the future.
WARNING:root:Using legacy_rel_selfattn and it will be deprecated in the future.
hp-1:28603:28603 [0] NCCL INFO Bootstrap : Using ens3:10.0.0.27<0>
hp-1:28603:28603 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
hp-1:28603:28603 [0] misc/ibvwrap.cc:63 NCCL WARN Failed to open libibverbs.so[.1]
hp-1:28603:28603 [0] NCCL INFO NET/Socket : Using [0]ens3:10.0.0.27<0>
hp-1:28603:28603 [0] NCCL INFO Using network Socket
NCCL version 2.10.3+cuda10.2
hp-1:28608:28608 [0] NCCL INFO Bootstrap : Using ens3:10.0.0.27<0>
hp-1:28608:28608 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
hp-1:28608:28608 [0] misc/ibvwrap.cc:63 NCCL WARN Failed to open libibverbs.so[.1]
hp-1:28608:28608 [0] NCCL INFO NET/Socket : Using [0]ens3:10.0.0.27<0>
hp-1:28608:28608 [0] NCCL INFO Using network Socket
NCCL version 2.10.3+cuda10.2
Traceback (most recent call last):
File "/home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/lib/python3.9/runpy.py", line 197, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/lib/python3.9/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/home/ubuntu/users/himanshu/espnet/espnet2/bin/asr_train.py", line 23, in <module>
main()
File "/home/ubuntu/users/himanshu/espnet/espnet2/bin/asr_train.py", line 19, in main
ASRTask.main(cmd=cmd)
File "/home/ubuntu/users/himanshu/espnet/espnet2/tasks/abs_task.py", line 1013, in main
cls.main_worker(args)
File "/home/ubuntu/users/himanshu/espnet/espnet2/tasks/abs_task.py", line 1309, in main_worker
cls.trainer.run(
File "/home/ubuntu/users/himanshu/espnet/espnet2/train/trainer.py", line 220, in run
dp_model = torch.nn.parallel.DistributedDataParallel(
File "/home/ubuntu/.local/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 646, in __init__
_verify_param_shape_across_processes(self.process_group, parameters)
File "/home/ubuntu/.local/lib/python3.9/site-packages/torch/distributed/utils.py", line 89, in _verify_param_shape_across_processes
return dist._verify_params_across_processes(process_group, tensors, logger)
RuntimeError: [1] is setting up NCCL communicator and retreiving ncclUniqueId from [0] via c10d key-value store by key '0', but store->get('0') got error: Timeout waiting for key: default_pg/0/0 after 1800000 ms
Exception raised from get at ../torch/csrc/distributed/c10d/FileStore.cpp:362 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x42 (0x7f03a5bba612 in /home/ubuntu/.local/lib/python3.9/site-packages/torch/lib/libc10.so)
frame #1: c10::detail::torchCheckFail(char const*, char const*, unsigned int, std::string const&) + 0x5b (0x7f03a5bb6cab in /home/ubuntu/.local/lib/python3.9/site-packages/torch/lib/libc10.so)
frame #2: c10d::FileStore::get(std::string const&) + 0xb09 (0x7f03da1ce739 in /home/ubuntu/.local/lib/python3.9/site-packages/torch/lib/libtorch_cpu.so)
frame #3: c10d::PrefixStore::get(std::string const&) + 0x32 (0x7f03da1d13c2 in /home/ubuntu/.local/lib/python3.9/site-packages/torch/lib/libtorch_cpu.so)
frame #4: c10d::PrefixStore::get(std::string const&) + 0x32 (0x7f03da1d13c2 in /home/ubuntu/.local/lib/python3.9/site-packages/torch/lib/libtorch_cpu.so)
frame #5: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xb1 (0x7f03a6ffa301 in /home/ubuntu/.local/lib/python3.9/site-packages/torch/lib/libtorch_cuda.so)
frame #6: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, std::vector<c10::Device, std::allocatorc10::Device > const&, c10d::OpType, int, bool) + 0x204 (0x7f03a6ffe794 in /home/ubuntu/.local/lib/python3.9/site-packages/torch/lib/libtorch_cuda.so)
frame #7: c10d::ProcessGroupNCCL::allgather(std::vector<std::vector<at::Tensor, std::allocatorat::Tensor >, std::allocator<std::vector<at::Tensor, std::allocatorat::Tensor > > >&, std::vector<at::Tensor, std::allocatorat::Tensor >&, c10d::AllgatherOptions const&) + 0x34b (0x7f03a700c7db in /home/ubuntu/.local/lib/python3.9/site-packages/torch/lib/libtorch_cuda.so)
frame #8: c10d::verify_params_across_processes(c10::intrusive_ptr<c10d::ProcessGroup, c10::detail::intrusive_target_default_null_typec10d::ProcessGroup > const&, std::vector<at::Tensor, std::allocatorat::Tensor > const&, c10::optional<std::weak_ptrc10d::Logger > const&) + 0x3f5 (0x7f03da21b825 in /home/ubuntu/.local/lib/python3.9/site-packages/torch/lib/libtorch_cpu.so)
frame #9: + 0x87cebc (0x7f03ef97debc in /home/ubuntu/.local/lib/python3.9/site-packages/torch/lib/libtorch_python.so)
frame #10: + 0x21ebc5 (0x7f03ef31fbc5 in /home/ubuntu/.local/lib/python3.9/site-packages/torch/lib/libtorch_python.so)
frame #11: + 0x1828f4 (0x55f7867078f4 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #12: _PyObject_MakeTpCall + 0x2df (0x55f7866c147f in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #13: _PyEval_EvalFrameDefault + 0x49a9 (0x55f78675f2e9 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #14: + 0x196fe3 (0x55f78671bfe3 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #15: _PyFunction_Vectorcall + 0x1d4 (0x55f78671ccb4 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #16: + 0xfe088 (0x55f786683088 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #17: + 0x196fe3 (0x55f78671bfe3 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #18: _PyFunction_Vectorcall + 0x244 (0x55f78671cd24 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #19: _PyObject_FastCallDictTstate + 0xee (0x55f786707a2e in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #20: + 0x18c429 (0x55f786711429 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #21: _PyObject_MakeTpCall + 0x38f (0x55f7866c152f in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #22: _PyEval_EvalFrameDefault + 0x1350 (0x55f78675bc90 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #23: + 0x196fe3 (0x55f78671bfe3 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #24: + 0x198709 (0x55f78671d709 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #25: + 0xfe73d (0x55f78668373d in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #26: + 0x198559 (0x55f78671d559 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #27: + 0xff300 (0x55f786684300 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #28: + 0x196fe3 (0x55f78671bfe3 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #29: + 0x198709 (0x55f78671d709 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #30: + 0xfe73d (0x55f78668373d in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #31: + 0x231418 (0x55f7867b6418 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #32: + 0xfe088 (0x55f786683088 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #33: + 0x196fe3 (0x55f78671bfe3 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #34: PyEval_EvalCodeEx + 0x4c (0x55f7867c8a7c in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #35: PyEval_EvalCode + 0x1b (0x55f78671cdbb in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #36: + 0x27a33e (0x55f7867ff33e in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #37: + 0x1a1571 (0x55f786726571 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #38: + 0xfe088 (0x55f786683088 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #39: + 0x196fe3 (0x55f78671bfe3 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #40: _PyFunction_Vectorcall + 0x1d4 (0x55f78671ccb4 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #41: + 0xfe088 (0x55f786683088 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #42: + 0x196fe3 (0x55f78671bfe3 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #43: _PyFunction_Vectorcall + 0x1d4 (0x55f78671ccb4 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #44: _PyObject_Call + 0x1da (0x55f7866cb30a in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #45: + 0x274eaa (0x55f7867f9eaa in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #46: Py_RunMain + 0x18f (0x55f7867fec0f in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #47: Py_BytesMain + 0x39 (0x55f7867feff9 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #48: __libc_start_main + 0xe7 (0x7f0416b22c87 in /lib/x86_64-linux-gnu/libc.so.6)
frame #49: + 0x2016a0 (0x55f7867866a0 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
Traceback (most recent call last):
File "/home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/lib/python3.9/runpy.py", line 197, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/lib/python3.9/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/home/ubuntu/users/himanshu/espnet/espnet2/bin/asr_train.py", line 23, in <module>
main()
File "/home/ubuntu/users/himanshu/espnet/espnet2/bin/asr_train.py", line 19, in main
ASRTask.main(cmd=cmd)
File "/home/ubuntu/users/himanshu/espnet/espnet2/tasks/abs_task.py", line 1013, in main
cls.main_worker(args)
File "/home/ubuntu/users/himanshu/espnet/espnet2/tasks/abs_task.py", line 1309, in main_worker
cls.trainer.run(
File "/home/ubuntu/users/himanshu/espnet/espnet2/train/trainer.py", line 220, in run
dp_model = torch.nn.parallel.DistributedDataParallel(
File "/home/ubuntu/.local/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 646, in __init__
_verify_param_shape_across_processes(self.process_group, parameters)
File "/home/ubuntu/.local/lib/python3.9/site-packages/torch/distributed/utils.py", line 89, in _verify_param_shape_across_processes
return dist._verify_params_across_processes(process_group, tensors, logger)
RuntimeError: [1] is setting up NCCL communicator and retreiving ncclUniqueId from [0] via c10d key-value store by key '0', but store->get('0') got error: Timeout waiting for key: default_pg/0/0 after 1800000 ms
Exception raised from get at ../torch/csrc/distributed/c10d/FileStore.cpp:362 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x42 (0x7fa47e37a612 in /home/ubuntu/.local/lib/python3.9/site-packages/torch/lib/libc10.so)
frame #1: c10::detail::torchCheckFail(char const*, char const*, unsigned int, std::string const&) + 0x5b (0x7fa47e376cab in /home/ubuntu/.local/lib/python3.9/site-packages/torch/lib/libc10.so)
frame #2: c10d::FileStore::get(std::string const&) + 0xb09 (0x7fa4b298e739 in /home/ubuntu/.local/lib/python3.9/site-packages/torch/lib/libtorch_cpu.so)
frame #3: c10d::PrefixStore::get(std::string const&) + 0x32 (0x7fa4b29913c2 in /home/ubuntu/.local/lib/python3.9/site-packages/torch/lib/libtorch_cpu.so)
frame #4: c10d::PrefixStore::get(std::string const&) + 0x32 (0x7fa4b29913c2 in /home/ubuntu/.local/lib/python3.9/site-packages/torch/lib/libtorch_cpu.so)
frame #5: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xb1 (0x7fa47f7ba301 in /home/ubuntu/.local/lib/python3.9/site-packages/torch/lib/libtorch_cuda.so)
frame #6: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, std::vector<c10::Device, std::allocatorc10::Device > const&, c10d::OpType, int, bool) + 0x204 (0x7fa47f7be794 in /home/ubuntu/.local/lib/python3.9/site-packages/torch/lib/libtorch_cuda.so)
frame #7: c10d::ProcessGroupNCCL::allgather(std::vector<std::vector<at::Tensor, std::allocatorat::Tensor >, std::allocator<std::vector<at::Tensor, std::allocatorat::Tensor > > >&, std::vector<at::Tensor, std::allocatorat::Tensor >&, c10d::AllgatherOptions const&) + 0x34b (0x7fa47f7cc7db in /home/ubuntu/.local/lib/python3.9/site-packages/torch/lib/libtorch_cuda.so)
frame #8: c10d::verify_params_across_processes(c10::intrusive_ptr<c10d::ProcessGroup, c10::detail::intrusive_target_default_null_typec10d::ProcessGroup > const&, std::vector<at::Tensor, std::allocatorat::Tensor > const&, c10::optional<std::weak_ptrc10d::Logger > const&) + 0x3f5 (0x7fa4b29db825 in /home/ubuntu/.local/lib/python3.9/site-packages/torch/lib/libtorch_cpu.so)
frame #9: + 0x87cebc (0x7fa4c813debc in /home/ubuntu/.local/lib/python3.9/site-packages/torch/lib/libtorch_python.so)
frame #10: + 0x21ebc5 (0x7fa4c7adfbc5 in /home/ubuntu/.local/lib/python3.9/site-packages/torch/lib/libtorch_python.so)
frame #11: + 0x1828f4 (0x559e091508f4 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #12: _PyObject_MakeTpCall + 0x2df (0x559e0910a47f in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #13: _PyEval_EvalFrameDefault + 0x49a9 (0x559e091a82e9 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #14: + 0x196fe3 (0x559e09164fe3 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #15: _PyFunction_Vectorcall + 0x1d4 (0x559e09165cb4 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #16: + 0xfe088 (0x559e090cc088 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #17: + 0x196fe3 (0x559e09164fe3 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #18: _PyFunction_Vectorcall + 0x244 (0x559e09165d24 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #19: _PyObject_FastCallDictTstate + 0xee (0x559e09150a2e in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #20: + 0x18c429 (0x559e0915a429 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #21: _PyObject_MakeTpCall + 0x38f (0x559e0910a52f in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #22: _PyEval_EvalFrameDefault + 0x1350 (0x559e091a4c90 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #23: + 0x196fe3 (0x559e09164fe3 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #24: + 0x198709 (0x559e09166709 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #25: + 0xfe73d (0x559e090cc73d in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #26: + 0x198559 (0x559e09166559 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #27: + 0xff300 (0x559e090cd300 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #28: + 0x196fe3 (0x559e09164fe3 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #29: + 0x198709 (0x559e09166709 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #30: + 0xfe73d (0x559e090cc73d in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #31: + 0x231418 (0x559e091ff418 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #32: + 0xfe088 (0x559e090cc088 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #33: + 0x196fe3 (0x559e09164fe3 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #34: PyEval_EvalCodeEx + 0x4c (0x559e09211a7c in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #35: PyEval_EvalCode + 0x1b (0x559e09165dbb in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #36: + 0x27a33e (0x559e0924833e in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #37: + 0x1a1571 (0x559e0916f571 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #38: + 0xfe088 (0x559e090cc088 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #39: + 0x196fe3 (0x559e09164fe3 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #40: _PyFunction_Vectorcall + 0x1d4 (0x559e09165cb4 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #41: + 0xfe088 (0x559e090cc088 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #42: + 0x196fe3 (0x559e09164fe3 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #43: _PyFunction_Vectorcall + 0x1d4 (0x559e09165cb4 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #44: _PyObject_Call + 0x1da (0x559e0911430a in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #45: + 0x274eaa (0x559e09242eaa in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #46: Py_RunMain + 0x18f (0x559e09247c0f in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #47: Py_BytesMain + 0x39 (0x559e09247ff9 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
frame #48: __libc_start_main + 0xe7 (0x7fa4ef2e2c87 in /lib/x86_64-linux-gnu/libc.so.6)
frame #49: + 0x2016a0 (0x559e091cf6a0 in /home/ec2-user/SageMaker/espnet_env1/envs/espnet_env1/bin/python3)
srun: error: hp-2: task 1: Exited with exit code 1
srun: error: hp-2: task 1: Exited with exit code 1
srun: Job step aborted: Waiting up to 32 seconds for job step to finish.
srun: Job step aborted: Waiting up to 32 seconds for job step to finish.
srun: got SIGCONT
srun: forcing job termination
srun: got SIGCONT
srun: Job step aborted: Waiting up to 32 seconds for job step to finish.
srun: got SIGCONT
slurmstepd-hp-1: error: *** STEP 70.2 ON hp-1 CANCELLED AT 2022-07-23T18:02:02 ***
srun: forcing job termination
slurmstepd-hp-1: error: *** STEP 70.1 ON hp-1 CANCELLED AT 2022-07-23T18:02:02 ***
slurmstepd-hp-1: error: *** STEP 70.0 ON hp-1 CANCELLED AT 2022-07-23T18:02:02 ***
srun: forcing job termination