@inproceedings{masala-etal-2024-vorbesti,
title = "{``}Vorbe{\textcommabelow{s}}ti Rom{\^a}ne{\textcommabelow{s}}te?{''} A Recipe to Train Powerful {R}omanian {LLM}s with {E}nglish Instructions",
author = "Masala, Mihai and
Ilie-Ablachim, Denis and
Dima, Alexandru and
Corlatescu, Dragos Georgian and
Zavelca, Miruna-Andreea and
Olaru, Ovio and
Terian, Simina-Maria and
Terian, Andrei and
Leordeanu, Marius and
Velicu, Horia and
Popescu, Marius and
Dascalu, Mihai and
Rebedea, Traian",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-emnlp.681",
doi = "10.18653/v1/2024.findings-emnlp.681",
pages = "11632--11647",
abstract = "In recent years, Large Language Models (LLMs) have achieved almost human-like performance on various tasks. While some LLMs have been trained on multilingual data, most of the training data is in English; hence, their performance in English greatly exceeds other languages. To our knowledge, we are the first to collect and translate a large collection of texts, instructions, and benchmarks and train, evaluate, and release open-source LLMs tailored for Romanian. We evaluate our methods on four different categories, including academic benchmarks, MT-Bench (manually translated), and a professionally built historical, cultural, and social benchmark adapted to Romanian. We argue for the usefulness and high performance of RoLLMs by obtaining state-of-the-art results across the board. We publicly release all resources (i.e., data, training and evaluation code, models) with the goal of supporting and encouraging research on Romanian LLMs while concurrently creating a generalizable recipe adequate for other low or less-resourced languages.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="masala-etal-2024-vorbesti">
<titleInfo>
<title>“Vorbești Românește?” A Recipe to Train Powerful Romanian LLMs with English Instructions</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mihai</namePart>
<namePart type="family">Masala</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Denis</namePart>
<namePart type="family">Ilie-Ablachim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexandru</namePart>
<namePart type="family">Dima</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dragos</namePart>
<namePart type="given">Georgian</namePart>
<namePart type="family">Corlatescu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Miruna-Andreea</namePart>
<namePart type="family">Zavelca</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ovio</namePart>
<namePart type="family">Olaru</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simina-Maria</namePart>
<namePart type="family">Terian</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andrei</namePart>
<namePart type="family">Terian</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marius</namePart>
<namePart type="family">Leordeanu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Horia</namePart>
<namePart type="family">Velicu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marius</namePart>
<namePart type="family">Popescu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mihai</namePart>
<namePart type="family">Dascalu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Traian</namePart>
<namePart type="family">Rebedea</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In recent years, Large Language Models (LLMs) have achieved almost human-like performance on various tasks. While some LLMs have been trained on multilingual data, most of the training data is in English; hence, their performance in English greatly exceeds that in other languages. To our knowledge, we are the first to collect and translate a large collection of texts, instructions, and benchmarks and train, evaluate, and release open-source LLMs tailored for Romanian. We evaluate our methods on four different categories, including academic benchmarks, MT-Bench (manually translated), and a professionally built historical, cultural, and social benchmark adapted to Romanian. We argue for the usefulness and high performance of RoLLMs by obtaining state-of-the-art results across the board. We publicly release all resources (i.e., data, training and evaluation code, models) with the goal of supporting and encouraging research on Romanian LLMs while concurrently creating a generalizable recipe adequate for other low or less-resourced languages.</abstract>
<identifier type="citekey">masala-etal-2024-vorbesti</identifier>
<identifier type="doi">10.18653/v1/2024.findings-emnlp.681</identifier>
<location>
<url>https://aclanthology.org/2024.findings-emnlp.681</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>11632</start>
<end>11647</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T “Vorbești Românește?” A Recipe to Train Powerful Romanian LLMs with English Instructions
%A Masala, Mihai
%A Ilie-Ablachim, Denis
%A Dima, Alexandru
%A Corlatescu, Dragos Georgian
%A Zavelca, Miruna-Andreea
%A Olaru, Ovio
%A Terian, Simina-Maria
%A Terian, Andrei
%A Leordeanu, Marius
%A Velicu, Horia
%A Popescu, Marius
%A Dascalu, Mihai
%A Rebedea, Traian
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Findings of the Association for Computational Linguistics: EMNLP 2024
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F masala-etal-2024-vorbesti
%X In recent years, Large Language Models (LLMs) have achieved almost human-like performance on various tasks. While some LLMs have been trained on multilingual data, most of the training data is in English; hence, their performance in English greatly exceeds that in other languages. To our knowledge, we are the first to collect and translate a large collection of texts, instructions, and benchmarks and train, evaluate, and release open-source LLMs tailored for Romanian. We evaluate our methods on four different categories, including academic benchmarks, MT-Bench (manually translated), and a professionally built historical, cultural, and social benchmark adapted to Romanian. We argue for the usefulness and high performance of RoLLMs by obtaining state-of-the-art results across the board. We publicly release all resources (i.e., data, training and evaluation code, models) with the goal of supporting and encouraging research on Romanian LLMs while concurrently creating a generalizable recipe adequate for other low or less-resourced languages.
%R 10.18653/v1/2024.findings-emnlp.681
%U https://aclanthology.org/2024.findings-emnlp.681
%U https://doi.org/10.18653/v1/2024.findings-emnlp.681
%P 11632-11647
Markdown (Informal)
[“Vorbești Românește?” A Recipe to Train Powerful Romanian LLMs with English Instructions](https://aclanthology.org/2024.findings-emnlp.681) (Masala et al., Findings 2024)
ACL
Mihai Masala, Denis Ilie-Ablachim, Alexandru Dima, Dragos Georgian Corlatescu, Miruna-Andreea Zavelca, Ovio Olaru, Simina-Maria Terian, Andrei Terian, Marius Leordeanu, Horia Velicu, Marius Popescu, Mihai Dascalu, and Traian Rebedea. 2024. “Vorbești Românește?” A Recipe to Train Powerful Romanian LLMs with English Instructions. In Findings of the Association for Computational Linguistics: EMNLP 2024, pages 11632–11647, Miami, Florida, USA. Association for Computational Linguistics.
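
Note on the BibTeX entry: the title encodes the Romanian comma-below letters (ș, ț) as \textcommabelow{s} and \textcommabelow{t}, which compile only under a font encoding that defines \textcommabelow (a reasonably recent LaTeX kernel with T1 does). A minimal sketch of a citing document, assuming the entry above is saved in a hypothetical file named anthology.bib:

\documentclass{article}
% T1 font encoding provides \textcommabelow, which the entry's
% title uses for the Romanian letters s-comma and t-comma.
\usepackage[T1]{fontenc}

\begin{document}
Romanian LLM training recipes \cite{masala-etal-2024-vorbesti} ...

\bibliographystyle{plain} % any style your venue supports works here
\bibliography{anthology}  % assumes the entry lives in anthology.bib
\end{document}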