Dataset:
ccdv/WCEP-10
Summarization dataset copied from PRIMERA
This dataset is compatible with the run_summarization.py script from Transformers if you add this line to the summarization_name_mapping variable:
"ccdv/WCEP-10": ("document", "summary")
4 possible configs:
This dataset has 3 splits: train, validation, and test.
| Dataset Split | Number of Instances |
|---|---|
| Train | 8158 |
| Validation | 1020 |
| Test | 1022 |
@article{DBLP:journals/corr/abs-2005-10070,
  author     = {Demian Gholipour Ghalandari and Chris Hokamp and Nghia The Pham and John Glover and Georgiana Ifrim},
  title      = {A Large-Scale Multi-Document Summarization Dataset from the Wikipedia Current Events Portal},
  journal    = {CoRR},
  volume     = {abs/2005.10070},
  year       = {2020},
  url        = {https://arxiv.org/abs/2005.10070},
  eprinttype = {arXiv},
  eprint     = {2005.10070},
  timestamp  = {Fri, 22 May 2020 16:21:28 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2005-10070.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-2110-08499,
  author     = {Wen Xiao and Iz Beltagy and Giuseppe Carenini and Arman Cohan},
  title      = {{PRIMER:} Pyramid-based Masked Sentence Pre-training for Multi-document Summarization},
  journal    = {CoRR},
  volume     = {abs/2110.08499},
  year       = {2021},
  url        = {https://arxiv.org/abs/2110.08499},
  eprinttype = {arXiv},
  eprint     = {2110.08499},
  timestamp  = {Fri, 22 Oct 2021 13:33:09 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2110-08499.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}