@inproceedings{yin-etal-2024-lexmatcher,
title = "{L}ex{M}atcher: Dictionary-centric Data Curation for {LLM}-based Machine Translation",
author = "Yin, Yongjing and
Zeng, Jiali and
Li, Yafu and
Meng, Fandong and
Zhang, Yue",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-emnlp.866",
pages = "14767--14779",
abstract = "The fine-tuning of open-source large language models (LLMs) for machine translation has recently received considerable attention, marking a shift towards data-centric research from traditional neural machine translation. However, the area of data collection for instruction fine-tuning in machine translation remains relatively underexplored. In this paper, we present LexMatcher, a simple yet effective method for data curation,the design of which is driven by the coverage of senses found in bilingual dictionaries. The construction process comprises data retrieval from an existing corpus and data augmentation that supplements the infrequent senses of polysemous words. Utilizing LLaMA2 as our base model, our method outperforms the established baselines on the WMT2022 test sets and also exhibits remarkable performance in tasks related to word sense disambiguation and specialized terminology translation. Our method is also applicable to other pre-trained models, and complements the method of continual pre-training using monolingual data, demonstrating the effectiveness of LexMatcher in enhancing LLM-based machine translation.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yin-etal-2024-lexmatcher">
<titleInfo>
<title>LexMatcher: Dictionary-centric Data Curation for LLM-based Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yongjing</namePart>
<namePart type="family">Yin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiali</namePart>
<namePart type="family">Zeng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yafu</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fandong</namePart>
<namePart type="family">Meng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yue</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The fine-tuning of open-source large language models (LLMs) for machine translation has recently received considerable attention, marking a shift towards data-centric research from traditional neural machine translation. However, the area of data collection for instruction fine-tuning in machine translation remains relatively underexplored. In this paper, we present LexMatcher, a simple yet effective method for data curation,the design of which is driven by the coverage of senses found in bilingual dictionaries. The construction process comprises data retrieval from an existing corpus and data augmentation that supplements the infrequent senses of polysemous words. Utilizing LLaMA2 as our base model, our method outperforms the established baselines on the WMT2022 test sets and also exhibits remarkable performance in tasks related to word sense disambiguation and specialized terminology translation. Our method is also applicable to other pre-trained models, and complements the method of continual pre-training using monolingual data, demonstrating the effectiveness of LexMatcher in enhancing LLM-based machine translation.</abstract>
<identifier type="citekey">yin-etal-2024-lexmatcher</identifier>
<location>
<url>https://aclanthology.org/2024.findings-emnlp.866</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>14767</start>
<end>14779</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T LexMatcher: Dictionary-centric Data Curation for LLM-based Machine Translation
%A Yin, Yongjing
%A Zeng, Jiali
%A Li, Yafu
%A Meng, Fandong
%A Zhang, Yue
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Findings of the Association for Computational Linguistics: EMNLP 2024
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F yin-etal-2024-lexmatcher
%X The fine-tuning of open-source large language models (LLMs) for machine translation has recently received considerable attention, marking a shift towards data-centric research from traditional neural machine translation. However, the area of data collection for instruction fine-tuning in machine translation remains relatively underexplored. In this paper, we present LexMatcher, a simple yet effective method for data curation,the design of which is driven by the coverage of senses found in bilingual dictionaries. The construction process comprises data retrieval from an existing corpus and data augmentation that supplements the infrequent senses of polysemous words. Utilizing LLaMA2 as our base model, our method outperforms the established baselines on the WMT2022 test sets and also exhibits remarkable performance in tasks related to word sense disambiguation and specialized terminology translation. Our method is also applicable to other pre-trained models, and complements the method of continual pre-training using monolingual data, demonstrating the effectiveness of LexMatcher in enhancing LLM-based machine translation.
%U https://aclanthology.org/2024.findings-emnlp.866
%P 14767-14779
Markdown (Informal)
[LexMatcher: Dictionary-centric Data Curation for LLM-based Machine Translation](https://aclanthology.org/2024.findings-emnlp.866) (Yin et al., Findings 2024)
ACL