@inproceedings{esalati-etal-2024-esposito,
title = "Esposito: An {E}nglish-{P}ersian Scientific Parallel Corpus for Machine Translation",
author = "Esalati, Mersad and
Dousti, Mohammad Javad and
Faili, Heshaam",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lrec-main.557",
pages = "6299--6308",
abstract = "Neural machine translation requires large number of parallel sentences along with in-domain parallel data to attain best results. Nevertheless, no scientific parallel corpus for English-Persian language pair is available. In this paper, a parallel corpus called Esposito is introduced, which contains 3.5 million parallel sentences in the scientific domain for English-Persian language pair. In addition, we present a manually validated scientific test set that might serve as a baseline for future studies. We show that a system trained using Esposito along with other publicly available data improves the baseline on average by 7.6 and 8.4 BLEU scores for En-{\textgreater}Fa and Fa-{\textgreater}En directions, respectively. Additionally, domain analysis using the 5-gram KenLM model revealed notable distinctions between our parallel corpus and the existing generic parallel corpus. This dataset will be available to the public upon the acceptance of the paper.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="esalati-etal-2024-esposito">
<titleInfo>
<title>Esposito: An English-Persian Scientific Parallel Corpus for Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mersad</namePart>
<namePart type="family">Esalati</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Javad</namePart>
<namePart type="family">Dousti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Heshaam</namePart>
<namePart type="family">Faili</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min-Yen</namePart>
<namePart type="family">Kan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronique</namePart>
<namePart type="family">Hoste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Neural machine translation requires large number of parallel sentences along with in-domain parallel data to attain best results. Nevertheless, no scientific parallel corpus for English-Persian language pair is available. In this paper, a parallel corpus called Esposito is introduced, which contains 3.5 million parallel sentences in the scientific domain for English-Persian language pair. In addition, we present a manually validated scientific test set that might serve as a baseline for future studies. We show that a system trained using Esposito along with other publicly available data improves the baseline on average by 7.6 and 8.4 BLEU scores for En-\textgreaterFa and Fa-\textgreaterEn directions, respectively. Additionally, domain analysis using the 5-gram KenLM model revealed notable distinctions between our parallel corpus and the existing generic parallel corpus. This dataset will be available to the public upon the acceptance of the paper.</abstract>
<identifier type="citekey">esalati-etal-2024-esposito</identifier>
<location>
<url>https://aclanthology.org/2024.lrec-main.557</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>6299</start>
<end>6308</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Esposito: An English-Persian Scientific Parallel Corpus for Machine Translation
%A Esalati, Mersad
%A Dousti, Mohammad Javad
%A Faili, Heshaam
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F esalati-etal-2024-esposito
%X Neural machine translation requires large number of parallel sentences along with in-domain parallel data to attain best results. Nevertheless, no scientific parallel corpus for English-Persian language pair is available. In this paper, a parallel corpus called Esposito is introduced, which contains 3.5 million parallel sentences in the scientific domain for English-Persian language pair. In addition, we present a manually validated scientific test set that might serve as a baseline for future studies. We show that a system trained using Esposito along with other publicly available data improves the baseline on average by 7.6 and 8.4 BLEU scores for En-\textgreaterFa and Fa-\textgreaterEn directions, respectively. Additionally, domain analysis using the 5-gram KenLM model revealed notable distinctions between our parallel corpus and the existing generic parallel corpus. This dataset will be available to the public upon the acceptance of the paper.
%U https://aclanthology.org/2024.lrec-main.557
%P 6299-6308
Markdown (Informal)
[Esposito: An English-Persian Scientific Parallel Corpus for Machine Translation](https://aclanthology.org/2024.lrec-main.557) (Esalati et al., LREC-COLING 2024)
ACL