@inproceedings{steffen-van-genabith-2021-transins,
title = "{T}rans{I}ns: Document Translation with Markup Reinsertion",
author = {Steffen, J{\"o}rg and
van Genabith, Josef},
editor = "Adel, Heike and
Shi, Shuming",
booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
month = nov,
year = "2021",
address = "Online and Punta Cana, Dominican Republic",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.emnlp-demo.4",
doi = "10.18653/v1/2021.emnlp-demo.4",
pages = "28--34",
abstract = "For many use cases, it is required that MT does not just translate raw text, but complex formatted documents (e.g. websites, slides, spreadsheets) and the result of the translation should reflect the formatting. This is challenging, as markup can be nested, apply to spans contiguous in source but non-contiguous in target etc. Here we present TransIns, a system for non-plain text document translation that builds on the Okapi framework and MT models trained with Marian NMT. We develop, implement and evaluate different strategies for reinserting markup into translated sentences using token alignments between source and target sentences. We propose a simple and effective strategy that compiles down all markup to single source tokens and transfers them to aligned target tokens. A first evaluation shows that this strategy yields highly accurate markup in the translated documents that outperforms the markup quality found in documents translated with popular translation services. We release TransIns under the MIT License as open-source software on \url{https://github.com/DFKI-MLT/TransIns}. An online demonstrator is available at \url{https://transins.dfki.de}.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="steffen-van-genabith-2021-transins">
<titleInfo>
<title>TransIns: Document Translation with Markup Reinsertion</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jörg</namePart>
<namePart type="family">Steffen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Josef</namePart>
<namePart type="family">van Genabith</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing: System Demonstrations</title>
</titleInfo>
<name type="personal">
<namePart type="given">Heike</namePart>
<namePart type="family">Adel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shuming</namePart>
<namePart type="family">Shi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online and Punta Cana, Dominican Republic</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>For many use cases, it is required that MT does not just translate raw text, but complex formatted documents (e.g. websites, slides, spreadsheets) and the result of the translation should reflect the formatting. This is challenging, as markup can be nested, apply to spans contiguous in source but non-contiguous in target etc. Here we present TransIns, a system for non-plain text document translation that builds on the Okapi framework and MT models trained with Marian NMT. We develop, implement and evaluate different strategies for reinserting markup into translated sentences using token alignments between source and target sentences. We propose a simple and effective strategy that compiles down all markup to single source tokens and transfers them to aligned target tokens. A first evaluation shows that this strategy yields highly accurate markup in the translated documents that outperforms the markup quality found in documents translated with popular translation services. We release TransIns under the MIT License as open-source software on https://github.com/DFKI-MLT/TransIns. An online demonstrator is available at https://transins.dfki.de.</abstract>
<identifier type="citekey">steffen-van-genabith-2021-transins</identifier>
<identifier type="doi">10.18653/v1/2021.emnlp-demo.4</identifier>
<location>
<url>https://aclanthology.org/2021.emnlp-demo.4</url>
</location>
<part>
<date>2021-11</date>
<extent unit="page">
<start>28</start>
<end>34</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T TransIns: Document Translation with Markup Reinsertion
%A Steffen, Jörg
%A van Genabith, Josef
%Y Adel, Heike
%Y Shi, Shuming
%S Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing: System Demonstrations
%D 2021
%8 November
%I Association for Computational Linguistics
%C Online and Punta Cana, Dominican Republic
%F steffen-van-genabith-2021-transins
%X For many use cases, it is required that MT does not just translate raw text, but complex formatted documents (e.g. websites, slides, spreadsheets) and the result of the translation should reflect the formatting. This is challenging, as markup can be nested, apply to spans contiguous in source but non-contiguous in target etc. Here we present TransIns, a system for non-plain text document translation that builds on the Okapi framework and MT models trained with Marian NMT. We develop, implement and evaluate different strategies for reinserting markup into translated sentences using token alignments between source and target sentences. We propose a simple and effective strategy that compiles down all markup to single source tokens and transfers them to aligned target tokens. A first evaluation shows that this strategy yields highly accurate markup in the translated documents that outperforms the markup quality found in documents translated with popular translation services. We release TransIns under the MIT License as open-source software on https://github.com/DFKI-MLT/TransIns. An online demonstrator is available at https://transins.dfki.de.
%R 10.18653/v1/2021.emnlp-demo.4
%U https://aclanthology.org/2021.emnlp-demo.4
%U https://doi.org/10.18653/v1/2021.emnlp-demo.4
%P 28-34
Markdown (Informal)
[TransIns: Document Translation with Markup Reinsertion](https://aclanthology.org/2021.emnlp-demo.4) (Steffen & van Genabith, EMNLP 2021)
ACL
- Jörg Steffen and Josef van Genabith. 2021. TransIns: Document Translation with Markup Reinsertion. In Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing: System Demonstrations, pages 28–34, Online and Punta Cana, Dominican Republic. Association for Computational Linguistics.