@inproceedings{zouhar-etal-2024-pitfalls,
title = "Pitfalls and Outlooks in Using {COMET}",
author = "Zouhar, Vil{\'e}m and
Chen, Pinzhen and
Lam, Tsz Kin and
Moghe, Nikita and
Haddow, Barry",
editor = "Haddow, Barry and
Kocmi, Tom and
Koehn, Philipp and
Monz, Christof",
booktitle = "Proceedings of the Ninth Conference on Machine Translation",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.wmt-1.121",
doi = "10.18653/v1/2024.wmt-1.121",
pages = "1272--1288",
abstract = "The COMET metric has blazed a trail in the machine translation community, given its strong correlation with human judgements of translation quality.Its success stems from being a modified pre-trained multilingual model finetuned for quality assessment.However, it being a machine learning model also gives rise to a new set of pitfalls that may not be widely known. We investigate these unexpected behaviours from three aspects:1) technical: obsolete software versions and compute precision; 2) data: empty content, language mismatch, and translationese at test time as well as distribution and domain biases in training; 3) usage and reporting: multi-reference support and model referencing in the literature. All of these problems imply that COMET scores are not comparable between papers or even technical setups and we put forward our perspective on fixing each issue.Furthermore, we release the sacreCOMET package that can generate a signature for the software and model configuration as well as an appropriate citation.The goal of this work is to help the community make more sound use of the COMET metric.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zouhar-etal-2024-pitfalls">
<titleInfo>
<title>Pitfalls and Outlooks in Using COMET</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vilém</namePart>
<namePart type="family">Zouhar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pinzhen</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tsz</namePart>
<namePart type="given">Kin</namePart>
<namePart type="family">Lam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nikita</namePart>
<namePart type="family">Moghe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barry</namePart>
<namePart type="family">Haddow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Ninth Conference on Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Barry</namePart>
<namePart type="family">Haddow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tom</namePart>
<namePart type="family">Kocmi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philipp</namePart>
<namePart type="family">Koehn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christof</namePart>
<namePart type="family">Monz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The COMET metric has blazed a trail in the machine translation community, given its strong correlation with human judgements of translation quality.Its success stems from being a modified pre-trained multilingual model finetuned for quality assessment.However, it being a machine learning model also gives rise to a new set of pitfalls that may not be widely known. We investigate these unexpected behaviours from three aspects:1) technical: obsolete software versions and compute precision; 2) data: empty content, language mismatch, and translationese at test time as well as distribution and domain biases in training; 3) usage and reporting: multi-reference support and model referencing in the literature. All of these problems imply that COMET scores are not comparable between papers or even technical setups and we put forward our perspective on fixing each issue.Furthermore, we release the sacreCOMET package that can generate a signature for the software and model configuration as well as an appropriate citation.The goal of this work is to help the community make more sound use of the COMET metric.</abstract>
<identifier type="citekey">zouhar-etal-2024-pitfalls</identifier>
<identifier type="doi">10.18653/v1/2024.wmt-1.121</identifier>
<location>
<url>https://aclanthology.org/2024.wmt-1.121</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>1272</start>
<end>1288</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Pitfalls and Outlooks in Using COMET
%A Zouhar, Vilém
%A Chen, Pinzhen
%A Lam, Tsz Kin
%A Moghe, Nikita
%A Haddow, Barry
%Y Haddow, Barry
%Y Kocmi, Tom
%Y Koehn, Philipp
%Y Monz, Christof
%S Proceedings of the Ninth Conference on Machine Translation
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F zouhar-etal-2024-pitfalls
%X The COMET metric has blazed a trail in the machine translation community, given its strong correlation with human judgements of translation quality.Its success stems from being a modified pre-trained multilingual model finetuned for quality assessment.However, it being a machine learning model also gives rise to a new set of pitfalls that may not be widely known. We investigate these unexpected behaviours from three aspects:1) technical: obsolete software versions and compute precision; 2) data: empty content, language mismatch, and translationese at test time as well as distribution and domain biases in training; 3) usage and reporting: multi-reference support and model referencing in the literature. All of these problems imply that COMET scores are not comparable between papers or even technical setups and we put forward our perspective on fixing each issue.Furthermore, we release the sacreCOMET package that can generate a signature for the software and model configuration as well as an appropriate citation.The goal of this work is to help the community make more sound use of the COMET metric.
%R 10.18653/v1/2024.wmt-1.121
%U https://aclanthology.org/2024.wmt-1.121
%U https://doi.org/10.18653/v1/2024.wmt-1.121
%P 1272-1288
Markdown (Informal)
[Pitfalls and Outlooks in Using COMET](https://aclanthology.org/2024.wmt-1.121) (Zouhar et al., WMT 2024)
ACL
- Vilém Zouhar, Pinzhen Chen, Tsz Kin Lam, Nikita Moghe, and Barry Haddow. 2024. Pitfalls and Outlooks in Using COMET. In Proceedings of the Ninth Conference on Machine Translation, pages 1272–1288, Miami, Florida, USA. Association for Computational Linguistics.