@inproceedings{maheshwari-etal-2024-dictdis,
title = "{D}ict{D}is: Dictionary Constrained Disambiguation for Improved {NMT}",
author = "Maheshwari, Ayush and
Jyothi, Preethi and
Ramakrishnan, Ganesh",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-emnlp.643",
doi = "10.18653/v1/2024.findings-emnlp.643",
pages = "10991--11004",
abstract = "Domain-specific neural machine translation (NMT) systems (e.g., in educational applications) are socially significant with the potential to help make information accessible to a diverse set of users in multilingual societies. Such NMT systems should be lexically constrained and draw from domain-specific dictionaries. Dictionaries could present multiple candidate translations for a source word/phrase due to the polysemous nature of words. The onus is then on the NMT model to choose the contextually most appropriate candidate. Prior work has largely ignored this problem and focused on the single candidate constraint setting wherein the target word or phrase is replaced by a single constraint. In this work, we present DictDis, a lexically constrained NMT system that disambiguates between multiple candidate translations derived from dictionaries. We achieve this by augmenting training data with multiple dictionary candidates to actively encourage disambiguation during training by implicitly aligning multiple candidate constraints. We demonstrate the utility of DictDis via extensive experiments on English-Hindi, English-German, and English-French datasets across a variety of domains including regulatory, finance, engineering, health and standard benchmark test datasets. In comparison with existing approaches for lexically constrained and unconstrained NMT, we demonstrate superior performance for the copy constraint and disambiguation-related measures on all domains, while also obtaining improved fluency of up to 2-3 BLEU points on some domains. We also release our test set consisting of 4K English-Hindi sentences in multiple domains.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="maheshwari-etal-2024-dictdis">
<titleInfo>
<title>DictDis: Dictionary Constrained Disambiguation for Improved NMT</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ayush</namePart>
<namePart type="family">Maheshwari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Preethi</namePart>
<namePart type="family">Jyothi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ganesh</namePart>
<namePart type="family">Ramakrishnan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Domain-specific neural machine translation (NMT) systems (e.g., in educational applications) are socially significant with the potential to help make information accessible to a diverse set of users in multilingual societies. Such NMT systems should be lexically constrained and draw from domain-specific dictionaries. Dictionaries could present multiple candidate translations for a source word/phrase due to the polysemous nature of words. The onus is then on the NMT model to choose the contextually most appropriate candidate. Prior work has largely ignored this problem and focused on the single candidate constraint setting wherein the target word or phrase is replaced by a single constraint. In this work, we present DictDis, a lexically constrained NMT system that disambiguates between multiple candidate translations derived from dictionaries. We achieve this by augmenting training data with multiple dictionary candidates to actively encourage disambiguation during training by implicitly aligning multiple candidate constraints. We demonstrate the utility of DictDis via extensive experiments on English-Hindi, English-German, and English-French datasets across a variety of domains including regulatory, finance, engineering, health and standard benchmark test datasets. In comparison with existing approaches for lexically constrained and unconstrained NMT, we demonstrate superior performance for the copy constraint and disambiguation-related measures on all domains, while also obtaining improved fluency of up to 2-3 BLEU points on some domains. We also release our test set consisting of 4K English-Hindi sentences in multiple domains.</abstract>
<identifier type="citekey">maheshwari-etal-2024-dictdis</identifier>
<identifier type="doi">10.18653/v1/2024.findings-emnlp.643</identifier>
<location>
<url>https://aclanthology.org/2024.findings-emnlp.643</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>10991</start>
<end>11004</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T DictDis: Dictionary Constrained Disambiguation for Improved NMT
%A Maheshwari, Ayush
%A Jyothi, Preethi
%A Ramakrishnan, Ganesh
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Findings of the Association for Computational Linguistics: EMNLP 2024
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F maheshwari-etal-2024-dictdis
%X Domain-specific neural machine translation (NMT) systems (e.g., in educational applications) are socially significant with the potential to help make information accessible to a diverse set of users in multilingual societies. Such NMT systems should be lexically constrained and draw from domain-specific dictionaries. Dictionaries could present multiple candidate translations for a source word/phrase due to the polysemous nature of words. The onus is then on the NMT model to choose the contextually most appropriate candidate. Prior work has largely ignored this problem and focused on the single candidate constraint setting wherein the target word or phrase is replaced by a single constraint. In this work, we present DictDis, a lexically constrained NMT system that disambiguates between multiple candidate translations derived from dictionaries. We achieve this by augmenting training data with multiple dictionary candidates to actively encourage disambiguation during training by implicitly aligning multiple candidate constraints. We demonstrate the utility of DictDis via extensive experiments on English-Hindi, English-German, and English-French datasets across a variety of domains including regulatory, finance, engineering, health and standard benchmark test datasets. In comparison with existing approaches for lexically constrained and unconstrained NMT, we demonstrate superior performance for the copy constraint and disambiguation-related measures on all domains, while also obtaining improved fluency of up to 2-3 BLEU points on some domains. We also release our test set consisting of 4K English-Hindi sentences in multiple domains.
%R 10.18653/v1/2024.findings-emnlp.643
%U https://aclanthology.org/2024.findings-emnlp.643
%U https://doi.org/10.18653/v1/2024.findings-emnlp.643
%P 10991-11004
Markdown (Informal)
[DictDis: Dictionary Constrained Disambiguation for Improved NMT](https://aclanthology.org/2024.findings-emnlp.643) (Maheshwari et al., Findings 2024)
ACL