From 5276bde711e1ba91172204374f58ba19319bb851 Mon Sep 17 00:00:00 2001 From: chmodsss Date: Thu, 9 Jan 2020 17:47:55 +0100 Subject: [PATCH 1/8] fixed path variables and hint for windows users --- README.md | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 2db0ec98..72ce2c47 100644 --- a/README.md +++ b/README.md @@ -101,10 +101,13 @@ export CLASSPATH=/path/to/stanford-corenlp-full-2017-06-09/stanford-corenlp-3.8. ``` replacing `/path/to/` with the path to where you saved the `stanford-corenlp-full-2017-06-09` directory. +NOTE for windows users: +use %WHATEVER_PATH% instead of $WHATEVER_PATH + #### Step 3. Sentence Splitting and Tokenization ``` -python preprocess.py -mode tokenize -raw_path RAW_PATH -save_path TOKENIZED_PATH +python preprocess.py -mode tokenize -raw_path $RAW_PATH -save_path $JSON_PATH ``` * `RAW_PATH` is the directory containing story files (`../raw_stories`), `JSON_PATH` is the target directory to save the generated json files (`../merged_stories_tokenized`) @@ -113,14 +116,14 @@ python preprocess.py -mode tokenize -raw_path RAW_PATH -save_path TOKENIZED_PATH #### Step 4. Format to Simpler Json Files ``` -python preprocess.py -mode format_to_lines -raw_path RAW_PATH -save_path JSON_PATH -n_cpus 1 -use_bert_basic_tokenizer false -map_path MAP_PATH +python preprocess.py -mode format_to_lines -raw_path $RAW_PATH -save_path $JSON_PATH -n_cpus 1 -use_bert_basic_tokenizer false -map_path $MAP_PATH ``` * `RAW_PATH` is the directory containing tokenized files (`../merged_stories_tokenized`), `JSON_PATH` is the target directory to save the generated json files (`../json_data/cnndm`), `MAP_PATH` is the directory containing the urls files (`../urls`) #### Step 5. 
Format to PyTorch Files ``` -python preprocess.py -mode format_to_bert -raw_path JSON_PATH -save_path BERT_DATA_PATH -lower -n_cpus 1 -log_file ../logs/preprocess.log +python preprocess.py -mode format_to_bert -raw_path $JSON_PATH -save_path $BERT_DATA_PATH -lower -n_cpus 1 -log_file ../logs/preprocess.log ``` * `JSON_PATH` is the directory containing json files (`../json_data`), `BERT_DATA_PATH` is the target directory to save the generated binary files (`../bert_data`) @@ -132,22 +135,22 @@ python preprocess.py -mode format_to_bert -raw_path JSON_PATH -save_path BERT_DA ### Extractive Setting ``` -python train.py -task ext -mode train -bert_data_path BERT_DATA_PATH -ext_dropout 0.1 -model_path MODEL_PATH -lr 2e-3 -visible_gpus 0,1,2 -report_every 50 -save_checkpoint_steps 1000 -batch_size 3000 -train_steps 50000 -accum_count 2 -log_file ../logs/ext_bert_cnndm -use_interval true -warmup_steps 10000 -max_pos 512 +python train.py -task ext -mode train -bert_data_path $BERT_DATA_PATH -ext_dropout 0.1 -model_path $MODEL_PATH -lr 2e-3 -visible_gpus 0,1,2 -report_every 50 -save_checkpoint_steps 1000 -batch_size 3000 -train_steps 50000 -accum_count 2 -log_file ../logs/ext_bert_cnndm -use_interval true -warmup_steps 10000 -max_pos 512 ``` ### Abstractive Setting #### TransformerAbs (baseline) ``` -python train.py -mode train -accum_count 5 -batch_size 300 -bert_data_path BERT_DATA_PATH -dec_dropout 0.1 -log_file ../../logs/cnndm_baseline -lr 0.05 -model_path MODEL_PATH -save_checkpoint_steps 2000 -seed 777 -sep_optim false -train_steps 200000 -use_bert_emb true -use_interval true -warmup_steps 8000 -visible_gpus 0,1,2,3 -max_pos 512 -report_every 50 -enc_hidden_size 512 -enc_layers 6 -enc_ff_size 2048 -enc_dropout 0.1 -dec_layers 6 -dec_hidden_size 512 -dec_ff_size 2048 -encoder baseline -task abs +python train.py -mode train -accum_count 5 -batch_size 300 -bert_data_path $BERT_DATA_PATH -dec_dropout 0.1 -log_file ../../logs/cnndm_baseline -lr 0.05 -model_path $MODEL_PATH 
-save_checkpoint_steps 2000 -seed 777 -sep_optim false -train_steps 200000 -use_bert_emb true -use_interval true -warmup_steps 8000 -visible_gpus 0,1,2,3 -max_pos 512 -report_every 50 -enc_hidden_size 512 -enc_layers 6 -enc_ff_size 2048 -enc_dropout 0.1 -dec_layers 6 -dec_hidden_size 512 -dec_ff_size 2048 -encoder baseline -task abs ``` #### BertAbs ``` -python train.py -task abs -mode train -bert_data_path BERT_DATA_PATH -dec_dropout 0.2 -model_path MODEL_PATH -sep_optim true -lr_bert 0.002 -lr_dec 0.2 -save_checkpoint_steps 2000 -batch_size 140 -train_steps 200000 -report_every 50 -accum_count 5 -use_bert_emb true -use_interval true -warmup_steps_bert 20000 -warmup_steps_dec 10000 -max_pos 512 -visible_gpus 0,1,2,3 -log_file ../logs/abs_bert_cnndm +python train.py -task abs -mode train -bert_data_path $BERT_DATA_PATH -dec_dropout 0.2 -model_path $MODEL_PATH -sep_optim true -lr_bert 0.002 -lr_dec 0.2 -save_checkpoint_steps 2000 -batch_size 140 -train_steps 200000 -report_every 50 -accum_count 5 -use_bert_emb true -use_interval true -warmup_steps_bert 20000 -warmup_steps_dec 10000 -max_pos 512 -visible_gpus 0,1,2,3 -log_file ../logs/abs_bert_cnndm ``` #### BertExtAbs ``` -python train.py -task abs -mode train -bert_data_path BERT_DATA_PATH -dec_dropout 0.2 -model_path MODEL_PATH -sep_optim true -lr_bert 0.002 -lr_dec 0.2 -save_checkpoint_steps 2000 -batch_size 140 -train_steps 200000 -report_every 50 -accum_count 5 -use_bert_emb true -use_interval true -warmup_steps_bert 20000 -warmup_steps_dec 10000 -max_pos 512 -visible_gpus 0,1,2,3 -log_file ../logs/abs_bert_cnndm -load_from_extractive EXT_CKPT +python train.py -task abs -mode train -bert_data_path $BERT_DATA_PATH -dec_dropout 0.2 -model_path $MODEL_PATH -sep_optim true -lr_bert 0.002 -lr_dec 0.2 -save_checkpoint_steps 2000 -batch_size 140 -train_steps 200000 -report_every 50 -accum_count 5 -use_bert_emb true -use_interval true -warmup_steps_bert 20000 -warmup_steps_dec 10000 -max_pos 512 -visible_gpus 0,1,2,3 
-log_file ../logs/abs_bert_cnndm -load_from_extractive $EXT_CKPT ``` * `EXT_CKPT` is the saved `.pt` checkpoint of the extractive model. @@ -157,11 +160,11 @@ python train.py -task abs -mode train -bert_data_path BERT_DATA_PATH -dec_dropo ## Model Evaluation ### CNN/DM ``` - python train.py -task abs -mode validate -batch_size 3000 -test_batch_size 500 -bert_data_path BERT_DATA_PATH -log_file ../logs/val_abs_bert_cnndm -model_path MODEL_PATH -sep_optim true -use_interval true -visible_gpus 1 -max_pos 512 -max_length 200 -alpha 0.95 -min_length 50 -result_path ../logs/abs_bert_cnndm + python train.py -task abs -mode validate -batch_size 3000 -test_batch_size 500 -bert_data_path $BERT_DATA_PATH -log_file ../logs/val_abs_bert_cnndm -model_path $MODEL_PATH -sep_optim true -use_interval true -visible_gpus 1 -max_pos 512 -max_length 200 -alpha 0.95 -min_length 50 -result_path ../logs/abs_bert_cnndm ``` ### XSum ``` - python train.py -task abs -mode validate -batch_size 3000 -test_batch_size 500 -bert_data_path BERT_DATA_PATH -log_file ../logs/val_abs_bert_cnndm -model_path MODEL_PATH -sep_optim true -use_interval true -visible_gpus 1 -max_pos 512 -min_length 20 -max_length 100 -alpha 0.9 -result_path ../logs/abs_bert_cnndm + python train.py -task abs -mode validate -batch_size 3000 -test_batch_size 500 -bert_data_path $BERT_DATA_PATH -log_file ../logs/val_abs_bert_cnndm -model_path $MODEL_PATH -sep_optim true -use_interval true -visible_gpus 1 -max_pos 512 -min_length 20 -max_length 100 -alpha 0.9 -result_path ../logs/abs_bert_cnndm ``` * `-mode` can be {`validate, test`}, where `validate` will inspect the model directory and evaluate the model for each newly saved checkpoint, `test` need to be used with `-test_from`, indicating the checkpoint you want to use * `MODEL_PATH` is the directory of saved checkpoints From 70d7a324923e7d0e229af592013bf3b60ccc82ef Mon Sep 17 00:00:00 2001 From: Yang Liu Date: Wed, 22 Jan 2020 19:13:54 +0000 Subject: [PATCH 2/8] update Readme for 
summarizing raw text input --- README.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/README.md b/README.md index 2db0ec98..a40ee68b 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,15 @@ **This code is for EMNLP 2019 paper [Text Summarization with Pretrained Encoders](https://arxiv.org/abs/1908.08345)** +**Updates Jan 22 2020**: Now you can **Summarize Raw Text Input!**. Swith to the dev branch, and use `-text_src $RAW_SRC.TXT` to input your text file. +* use `-test_from $PT_FILE$` to use your model checkpoint file. +* Format of the source text file: + * For **abstractive summarization**, each line is a document. + * If you want to do **extractive summarization**, please insert ` [CLS] [SEP] ` as your sentence boundaries. +* There are example input files in the [raw_data directory](https://github.com/nlpyang/PreSumm/tree/dev/raw_data) +* If you also have reference summaries aligned with your source input, please use `-text_tgt $RAW_TGT.TXT` to keep the order for evaluation. + + Results on CNN/DailyMail (20/8/2019): @@ -60,6 +69,8 @@ Results on CNN/DailyMail (20/8/2019): **Package Requirements**: torch==1.1.0 pytorch_transformers tensorboardX multiprocess pyrouge + + **Updates**: For encoding a text longer than 512 tokens, for example 800. Set max_pos to 800 during both preprocessing and training. From 2df3312582a3a014aacbc1be810841705c67d06e Mon Sep 17 00:00:00 2001 From: Yang Liu Date: Sat, 25 Jan 2020 19:14:36 +0000 Subject: [PATCH 3/8] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a40ee68b..c1800669 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ **This code is for EMNLP 2019 paper [Text Summarization with Pretrained Encoders](https://arxiv.org/abs/1908.08345)** -**Updates Jan 22 2020**: Now you can **Summarize Raw Text Input!**. Swith to the dev branch, and use `-text_src $RAW_SRC.TXT` to input your text file. 
+**Updates Jan 22 2020**: Now you can **Summarize Raw Text Input!** Switch to the dev branch, and use `-mode test_text` and use `-text_src $RAW_SRC.TXT` to input your text file. * use `-test_from $PT_FILE$` to use your model checkpoint file. * Format of the source text file: * For **abstractive summarization**, each line is a document. * If you want to do **extractive summarization**, please insert ` [CLS] [SEP] ` as your sentence boundaries. From 737354d8fdd5da22cf1427b3dd56753982d225b1 Mon Sep 17 00:00:00 2001 From: chmodsss Date: Fri, 14 Feb 2020 11:10:20 +0100 Subject: [PATCH 4/8] regex for listing bert model pt files fixed --- src/models/data_loader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/models/data_loader.py b/src/models/data_loader.py index 6f3e38fe..551013a3 100644 --- a/src/models/data_loader.py +++ b/src/models/data_loader.py @@ -81,7 +81,7 @@ def _lazy_dataset_loader(pt_file, corpus_type): return dataset # Sort the glob output by file name (by increasing indexes). - pts = sorted(glob.glob(args.bert_data_path + '.' + corpus_type + '.[0-9]*.pt')) + pts = sorted(glob.glob(args.bert_data_path + '*' + corpus_type + '.[0-9]*.pt')) if pts: if (shuffle): random.shuffle(pts) @@ -90,7 +90,7 @@ def _lazy_dataset_loader(pt_file, corpus_type): yield _lazy_dataset_loader(pt, corpus_type) else: # Only one inputters.*Dataset, simple! - pt = args.bert_data_path + '.' 
+ corpus_type + '.pt' + pt = glob.glob(args.bert_data_path + '*' + corpus_type + '.pt')[0] yield _lazy_dataset_loader(pt, corpus_type) From 3bc52e30440b8dd959e554d04d6dc5a990cc39f7 Mon Sep 17 00:00:00 2001 From: chmodsss Date: Fri, 14 Feb 2020 13:24:04 +0100 Subject: [PATCH 5/8] gitignore added for temp folder and rest --- .gitignore | 137 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 137 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..8f4297ac --- /dev/null +++ b/.gitignore @@ -0,0 +1,137 @@ +# temp folder for bert files +temp/ + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ From c93c36537248d238a24f59bba637e0d328f6e7f3 Mon Sep 17 00:00:00 2001 From: chmodsss Date: Thu, 9 Jan 2020 17:47:55 +0100 Subject: [PATCH 6/8] fixed path variables and hint for windows users --- README.md | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index c1800669..24317429 100644 --- a/README.md +++ b/README.md @@ -112,10 +112,13 @@ export CLASSPATH=/path/to/stanford-corenlp-full-2017-06-09/stanford-corenlp-3.8. ``` replacing `/path/to/` with the path to where you saved the `stanford-corenlp-full-2017-06-09` directory. +NOTE for windows users: +use %WHATEVER_PATH% instead of $WHATEVER_PATH + #### Step 3. Sentence Splitting and Tokenization ``` -python preprocess.py -mode tokenize -raw_path RAW_PATH -save_path TOKENIZED_PATH +python preprocess.py -mode tokenize -raw_path $RAW_PATH -save_path $JSON_PATH ``` * `RAW_PATH` is the directory containing story files (`../raw_stories`), `JSON_PATH` is the target directory to save the generated json files (`../merged_stories_tokenized`) @@ -124,14 +127,14 @@ python preprocess.py -mode tokenize -raw_path RAW_PATH -save_path TOKENIZED_PATH #### Step 4. 
Format to Simpler Json Files ``` -python preprocess.py -mode format_to_lines -raw_path RAW_PATH -save_path JSON_PATH -n_cpus 1 -use_bert_basic_tokenizer false -map_path MAP_PATH +python preprocess.py -mode format_to_lines -raw_path $RAW_PATH -save_path $JSON_PATH -n_cpus 1 -use_bert_basic_tokenizer false -map_path $MAP_PATH ``` * `RAW_PATH` is the directory containing tokenized files (`../merged_stories_tokenized`), `JSON_PATH` is the target directory to save the generated json files (`../json_data/cnndm`), `MAP_PATH` is the directory containing the urls files (`../urls`) #### Step 5. Format to PyTorch Files ``` -python preprocess.py -mode format_to_bert -raw_path JSON_PATH -save_path BERT_DATA_PATH -lower -n_cpus 1 -log_file ../logs/preprocess.log +python preprocess.py -mode format_to_bert -raw_path $JSON_PATH -save_path $BERT_DATA_PATH -lower -n_cpus 1 -log_file ../logs/preprocess.log ``` * `JSON_PATH` is the directory containing json files (`../json_data`), `BERT_DATA_PATH` is the target directory to save the generated binary files (`../bert_data`) @@ -143,22 +146,22 @@ python preprocess.py -mode format_to_bert -raw_path JSON_PATH -save_path BERT_DA ### Extractive Setting ``` -python train.py -task ext -mode train -bert_data_path BERT_DATA_PATH -ext_dropout 0.1 -model_path MODEL_PATH -lr 2e-3 -visible_gpus 0,1,2 -report_every 50 -save_checkpoint_steps 1000 -batch_size 3000 -train_steps 50000 -accum_count 2 -log_file ../logs/ext_bert_cnndm -use_interval true -warmup_steps 10000 -max_pos 512 +python train.py -task ext -mode train -bert_data_path $BERT_DATA_PATH -ext_dropout 0.1 -model_path $MODEL_PATH -lr 2e-3 -visible_gpus 0,1,2 -report_every 50 -save_checkpoint_steps 1000 -batch_size 3000 -train_steps 50000 -accum_count 2 -log_file ../logs/ext_bert_cnndm -use_interval true -warmup_steps 10000 -max_pos 512 ``` ### Abstractive Setting #### TransformerAbs (baseline) ``` -python train.py -mode train -accum_count 5 -batch_size 300 -bert_data_path BERT_DATA_PATH 
-dec_dropout 0.1 -log_file ../../logs/cnndm_baseline -lr 0.05 -model_path MODEL_PATH -save_checkpoint_steps 2000 -seed 777 -sep_optim false -train_steps 200000 -use_bert_emb true -use_interval true -warmup_steps 8000 -visible_gpus 0,1,2,3 -max_pos 512 -report_every 50 -enc_hidden_size 512 -enc_layers 6 -enc_ff_size 2048 -enc_dropout 0.1 -dec_layers 6 -dec_hidden_size 512 -dec_ff_size 2048 -encoder baseline -task abs +python train.py -mode train -accum_count 5 -batch_size 300 -bert_data_path $BERT_DATA_PATH -dec_dropout 0.1 -log_file ../../logs/cnndm_baseline -lr 0.05 -model_path $MODEL_PATH -save_checkpoint_steps 2000 -seed 777 -sep_optim false -train_steps 200000 -use_bert_emb true -use_interval true -warmup_steps 8000 -visible_gpus 0,1,2,3 -max_pos 512 -report_every 50 -enc_hidden_size 512 -enc_layers 6 -enc_ff_size 2048 -enc_dropout 0.1 -dec_layers 6 -dec_hidden_size 512 -dec_ff_size 2048 -encoder baseline -task abs ``` #### BertAbs ``` -python train.py -task abs -mode train -bert_data_path BERT_DATA_PATH -dec_dropout 0.2 -model_path MODEL_PATH -sep_optim true -lr_bert 0.002 -lr_dec 0.2 -save_checkpoint_steps 2000 -batch_size 140 -train_steps 200000 -report_every 50 -accum_count 5 -use_bert_emb true -use_interval true -warmup_steps_bert 20000 -warmup_steps_dec 10000 -max_pos 512 -visible_gpus 0,1,2,3 -log_file ../logs/abs_bert_cnndm +python train.py -task abs -mode train -bert_data_path $BERT_DATA_PATH -dec_dropout 0.2 -model_path $MODEL_PATH -sep_optim true -lr_bert 0.002 -lr_dec 0.2 -save_checkpoint_steps 2000 -batch_size 140 -train_steps 200000 -report_every 50 -accum_count 5 -use_bert_emb true -use_interval true -warmup_steps_bert 20000 -warmup_steps_dec 10000 -max_pos 512 -visible_gpus 0,1,2,3 -log_file ../logs/abs_bert_cnndm ``` #### BertExtAbs ``` -python train.py -task abs -mode train -bert_data_path BERT_DATA_PATH -dec_dropout 0.2 -model_path MODEL_PATH -sep_optim true -lr_bert 0.002 -lr_dec 0.2 -save_checkpoint_steps 2000 -batch_size 140 -train_steps 
200000 -report_every 50 -accum_count 5 -use_bert_emb true -use_interval true -warmup_steps_bert 20000 -warmup_steps_dec 10000 -max_pos 512 -visible_gpus 0,1,2,3 -log_file ../logs/abs_bert_cnndm -load_from_extractive EXT_CKPT +python train.py -task abs -mode train -bert_data_path $BERT_DATA_PATH -dec_dropout 0.2 -model_path $MODEL_PATH -sep_optim true -lr_bert 0.002 -lr_dec 0.2 -save_checkpoint_steps 2000 -batch_size 140 -train_steps 200000 -report_every 50 -accum_count 5 -use_bert_emb true -use_interval true -warmup_steps_bert 20000 -warmup_steps_dec 10000 -max_pos 512 -visible_gpus 0,1,2,3 -log_file ../logs/abs_bert_cnndm -load_from_extractive $EXT_CKPT ``` * `EXT_CKPT` is the saved `.pt` checkpoint of the extractive model. @@ -168,11 +171,11 @@ python train.py -task abs -mode train -bert_data_path BERT_DATA_PATH -dec_dropo ## Model Evaluation ### CNN/DM ``` - python train.py -task abs -mode validate -batch_size 3000 -test_batch_size 500 -bert_data_path BERT_DATA_PATH -log_file ../logs/val_abs_bert_cnndm -model_path MODEL_PATH -sep_optim true -use_interval true -visible_gpus 1 -max_pos 512 -max_length 200 -alpha 0.95 -min_length 50 -result_path ../logs/abs_bert_cnndm + python train.py -task abs -mode validate -batch_size 3000 -test_batch_size 500 -bert_data_path $BERT_DATA_PATH -log_file ../logs/val_abs_bert_cnndm -model_path $MODEL_PATH -sep_optim true -use_interval true -visible_gpus 1 -max_pos 512 -max_length 200 -alpha 0.95 -min_length 50 -result_path ../logs/abs_bert_cnndm ``` ### XSum ``` - python train.py -task abs -mode validate -batch_size 3000 -test_batch_size 500 -bert_data_path BERT_DATA_PATH -log_file ../logs/val_abs_bert_cnndm -model_path MODEL_PATH -sep_optim true -use_interval true -visible_gpus 1 -max_pos 512 -min_length 20 -max_length 100 -alpha 0.9 -result_path ../logs/abs_bert_cnndm + python train.py -task abs -mode validate -batch_size 3000 -test_batch_size 500 -bert_data_path $BERT_DATA_PATH -log_file ../logs/val_abs_bert_cnndm -model_path 
$MODEL_PATH -sep_optim true -use_interval true -visible_gpus 1 -max_pos 512 -min_length 20 -max_length 100 -alpha 0.9 -result_path ../logs/abs_bert_cnndm ``` * `-mode` can be {`validate, test`}, where `validate` will inspect the model directory and evaluate the model for each newly saved checkpoint, `test` need to be used with `-test_from`, indicating the checkpoint you want to use * `MODEL_PATH` is the directory of saved checkpoints From 1a313a8fdd8ac28a6605accd9349d87de12b5fea Mon Sep 17 00:00:00 2001 From: chmodsss Date: Fri, 14 Feb 2020 11:10:20 +0100 Subject: [PATCH 7/8] regex for listing bert model pt files fixed --- src/models/data_loader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/models/data_loader.py b/src/models/data_loader.py index 6f3e38fe..551013a3 100644 --- a/src/models/data_loader.py +++ b/src/models/data_loader.py @@ -81,7 +81,7 @@ def _lazy_dataset_loader(pt_file, corpus_type): return dataset # Sort the glob output by file name (by increasing indexes). - pts = sorted(glob.glob(args.bert_data_path + '.' + corpus_type + '.[0-9]*.pt')) + pts = sorted(glob.glob(args.bert_data_path + '*' + corpus_type + '.[0-9]*.pt')) if pts: if (shuffle): random.shuffle(pts) @@ -90,7 +90,7 @@ def _lazy_dataset_loader(pt_file, corpus_type): yield _lazy_dataset_loader(pt, corpus_type) else: # Only one inputters.*Dataset, simple! - pt = args.bert_data_path + '.' 
+ corpus_type + '.pt' + pt = args.bert_data_path + '*' + corpus_type + '.pt' yield _lazy_dataset_loader(pt, corpus_type) From 0d84c7a566ae61432a2895703e32cd93d0b71875 Mon Sep 17 00:00:00 2001 From: chmodsss Date: Fri, 14 Feb 2020 13:24:04 +0100 Subject: [PATCH 8/8] gitignore added for temp folder and rest --- .gitignore | 137 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 137 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..8f4297ac --- /dev/null +++ b/.gitignore @@ -0,0 +1,137 @@ +# temp folder for bert files +temp/ + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/