@inproceedings{e-mendoza-etal-2022-benchmark,
title = "Benchmark for Research Theme Classification of Scholarly Documents",
author = "E. Mendoza, {\'O}scar and
Kusa, Wojciech and
El-Ebshihy, Alaa and
Wu, Ronin and
Pride, David and
Knoth, Petr and
Herrmannova, Drahomira and
Piroi, Florina and
Pasi, Gabriella and
Hanbury, Allan",
editor = "Cohan, Arman and
Feigenblat, Guy and
Freitag, Dayne and
Ghosal, Tirthankar and
Herrmannova, Drahomira and
Knoth, Petr and
Lo, Kyle and
Mayr, Philipp and
Shmueli-Scheuer, Michal and
de Waard, Anita and
Wang, Lucy Lu",
booktitle = "Proceedings of the Third Workshop on Scholarly Document Processing",
month = oct,
year = "2022",
address = "Gyeongju, Republic of Korea",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.sdp-1.31",
pages = "253--262",
abstract = "We present a new gold-standard dataset and a benchmark for the Research Theme Identification task, a sub-task of the Scholarly Knowledge Graph Generation shared task, at the 3rd Workshop on Scholarly Document Processing. The objective of the shared task was to label given research papers with research themes from a total of 36 themes. The benchmark was compiled using data drawn from the largest overall assessment of university research output ever undertaken globally (the Research Excellence Framework - 2014). We provide a performance comparison of a transformer-based ensemble, which obtains multiple predictions for a research paper, given its multiple textual fields (e.g. title, abstract, reference), with traditional machine learning models. The ensemble involves enriching the initial data with additional information from open-access digital libraries and Argumentative Zoning techniques (CITATION). It uses a weighted sum aggregation for the multiple predictions to obtain a final single prediction for the given research paper. Both data and the ensemble are publicly available on \url{https://www.kaggle.com/competitions/sdp2022-scholarly-knowledge-graph-generation/data?select=task1_test_no_label.csv} and \url{https://github.com/ProjectDoSSIER/sdp2022}, respectively.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="e-mendoza-etal-2022-benchmark">
<titleInfo>
<title>Benchmark for Research Theme Classification of Scholarly Documents</title>
</titleInfo>
<name type="personal">
<namePart type="given">Óscar</namePart>
<namePart type="family">E. Mendoza</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wojciech</namePart>
<namePart type="family">Kusa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alaa</namePart>
<namePart type="family">El-Ebshihy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ronin</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Pride</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Petr</namePart>
<namePart type="family">Knoth</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Drahomira</namePart>
<namePart type="family">Herrmannova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Florina</namePart>
<namePart type="family">Piroi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gabriella</namePart>
<namePart type="family">Pasi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Allan</namePart>
<namePart type="family">Hanbury</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-10</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Third Workshop on Scholarly Document Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Arman</namePart>
<namePart type="family">Cohan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Guy</namePart>
<namePart type="family">Feigenblat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dayne</namePart>
<namePart type="family">Freitag</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tirthankar</namePart>
<namePart type="family">Ghosal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Drahomira</namePart>
<namePart type="family">Herrmannova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Petr</namePart>
<namePart type="family">Knoth</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kyle</namePart>
<namePart type="family">Lo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philipp</namePart>
<namePart type="family">Mayr</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michal</namePart>
<namePart type="family">Shmueli-Scheuer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anita</namePart>
<namePart type="family">de Waard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lucy</namePart>
<namePart type="given">Lu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Gyeongju, Republic of Korea</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We present a new gold-standard dataset and a benchmark for the Research Theme Identification task, a sub-task of the Scholarly Knowledge Graph Generation shared task, at the 3rd Workshop on Scholarly Document Processing. The objective of the shared task was to label given research papers with research themes from a total of 36 themes. The benchmark was compiled using data drawn from the largest overall assessment of university research output ever undertaken globally (the Research Excellence Framework - 2014). We provide a performance comparison of a transformer-based ensemble, which obtains multiple predictions for a research paper, given its multiple textual fields (e.g. title, abstract, reference), with traditional machine learning models. The ensemble involves enriching the initial data with additional information from open-access digital libraries and Argumentative Zoning techniques (CITATION). It uses a weighted sum aggregation for the multiple predictions to obtain a final single prediction for the given research paper. Both data and the ensemble are publicly available on https://www.kaggle.com/competitions/sdp2022-scholarly-knowledge-graph-generation/data?select=task1_test_no_label.csv and https://github.com/ProjectDoSSIER/sdp2022, respectively.</abstract>
<identifier type="citekey">e-mendoza-etal-2022-benchmark</identifier>
<location>
<url>https://aclanthology.org/2022.sdp-1.31</url>
</location>
<part>
<date>2022-10</date>
<extent unit="page">
<start>253</start>
<end>262</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Benchmark for Research Theme Classification of Scholarly Documents
%A E. Mendoza, Óscar
%A Kusa, Wojciech
%A El-Ebshihy, Alaa
%A Wu, Ronin
%A Pride, David
%A Knoth, Petr
%A Herrmannova, Drahomira
%A Piroi, Florina
%A Pasi, Gabriella
%A Hanbury, Allan
%Y Cohan, Arman
%Y Feigenblat, Guy
%Y Freitag, Dayne
%Y Ghosal, Tirthankar
%Y Herrmannova, Drahomira
%Y Knoth, Petr
%Y Lo, Kyle
%Y Mayr, Philipp
%Y Shmueli-Scheuer, Michal
%Y de Waard, Anita
%Y Wang, Lucy Lu
%S Proceedings of the Third Workshop on Scholarly Document Processing
%D 2022
%8 October
%I Association for Computational Linguistics
%C Gyeongju, Republic of Korea
%F e-mendoza-etal-2022-benchmark
%X We present a new gold-standard dataset and a benchmark for the Research Theme Identification task, a sub-task of the Scholarly Knowledge Graph Generation shared task, at the 3rd Workshop on Scholarly Document Processing. The objective of the shared task was to label given research papers with research themes from a total of 36 themes. The benchmark was compiled using data drawn from the largest overall assessment of university research output ever undertaken globally (the Research Excellence Framework - 2014). We provide a performance comparison of a transformer-based ensemble, which obtains multiple predictions for a research paper, given its multiple textual fields (e.g. title, abstract, reference), with traditional machine learning models. The ensemble involves enriching the initial data with additional information from open-access digital libraries and Argumentative Zoning techniques (CITATION). It uses a weighted sum aggregation for the multiple predictions to obtain a final single prediction for the given research paper. Both data and the ensemble are publicly available on https://www.kaggle.com/competitions/sdp2022-scholarly-knowledge-graph-generation/data?select=task1_test_no_label.csv and https://github.com/ProjectDoSSIER/sdp2022, respectively.
%U https://aclanthology.org/2022.sdp-1.31
%P 253-262
Markdown (Informal)
[Benchmark for Research Theme Classification of Scholarly Documents](https://aclanthology.org/2022.sdp-1.31) (E. Mendoza et al., sdp 2022)
ACL
- Óscar E. Mendoza, Wojciech Kusa, Alaa El-Ebshihy, Ronin Wu, David Pride, Petr Knoth, Drahomira Herrmannova, Florina Piroi, Gabriella Pasi, and Allan Hanbury. 2022. Benchmark for Research Theme Classification of Scholarly Documents. In Proceedings of the Third Workshop on Scholarly Document Processing, pages 253–262, Gyeongju, Republic of Korea. Association for Computational Linguistics.