% Conference paper (RANLP 2023). The proper noun in the title is brace-protected
% as a whole word so that sentence-casing styles keep its capital letter.
@inproceedings{marinova-etal-2023-transformer,
  title     = {Transformer-Based Language Models for {Bulgarian}},
  author    = {Marinova, Iva and
               Simov, Kiril and
               Osenova, Petya},
  editor    = {Mitkov, Ruslan and
               Angelova, Galia},
  booktitle = {Proceedings of the 14th International Conference on Recent Advances in Natural Language Processing},
  month     = sep,
  year      = {2023},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd., Shoumen, Bulgaria},
  url       = {https://aclanthology.org/2023.ranlp-1.77},
  pages     = {712--720},
  abstract  = {This paper presents an approach for training lightweight and robust language models for Bulgarian that mitigate gender, political, racial, and other biases in the data. Our method involves scraping content from major Bulgarian online media providers using a specialized procedure for source filtering, topic selection, and lexicon-based removal of inappropriate language during the pre-training phase. We continuously improve the models by incorporating new data from various domains, including social media, books, scientific literature, and linguistically modified corpora. Our motivation is to provide a solution that is sufficient for all natural language processing tasks in Bulgarian, and to address the lack of existing procedures for guaranteeing the robustness of such models.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for the paper with citekey marinova-etal-2023-transformer. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="marinova-etal-2023-transformer">
    <titleInfo>
      <title>Transformer-Based Language Models for Bulgarian</title>
    </titleInfo>
    <!-- Authors, in publication order. -->
    <name type="personal">
      <namePart type="given">Iva</namePart>
      <namePart type="family">Marinova</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Kiril</namePart>
      <namePart type="family">Simov</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Petya</namePart>
      <namePart type="family">Osenova</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2023-09</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher and place. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 14th International Conference on Recent Advances in Natural Language Processing</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Ruslan</namePart>
        <namePart type="family">Mitkov</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Galia</namePart>
        <namePart type="family">Angelova</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>INCOMA Ltd., Shoumen, Bulgaria</publisher>
        <place>
          <placeTerm type="text">Varna, Bulgaria</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>This paper presents an approach for training lightweight and robust language models for Bulgarian that mitigate gender, political, racial, and other biases in the data. Our method involves scraping content from major Bulgarian online media providers using a specialized procedure for source filtering, topic selection, and lexicon-based removal of inappropriate language during the pre-training phase. We continuously improve the models by incorporating new data from various domains, including social media, books, scientific literature, and linguistically modified corpora. Our motivation is to provide a solution that is sufficient for all natural language processing tasks in Bulgarian, and to address the lack of existing procedures for guaranteeing the robustness of such models.</abstract>
    <identifier type="citekey">marinova-etal-2023-transformer</identifier>
    <location>
      <url>https://aclanthology.org/2023.ranlp-1.77</url>
    </location>
    <!-- Position of the paper within the host volume. -->
    <part>
      <date>2023-09</date>
      <extent unit="page">
        <start>712</start>
        <end>720</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Transformer-Based Language Models for Bulgarian
%A Marinova, Iva
%A Simov, Kiril
%A Osenova, Petya
%Y Mitkov, Ruslan
%Y Angelova, Galia
%S Proceedings of the 14th International Conference on Recent Advances in Natural Language Processing
%D 2023
%8 September
%I INCOMA Ltd., Shoumen, Bulgaria
%C Varna, Bulgaria
%F marinova-etal-2023-transformer
%X This paper presents an approach for training lightweight and robust language models for Bulgarian that mitigate gender, political, racial, and other biases in the data. Our method involves scraping content from major Bulgarian online media providers using a specialized procedure for source filtering, topic selection, and lexicon-based removal of inappropriate language during the pre-training phase. We continuously improve the models by incorporating new data from various domains, including social media, books, scientific literature, and linguistically modified corpora. Our motivation is to provide a solution that is sufficient for all natural language processing tasks in Bulgarian, and to address the lack of existing procedures for guaranteeing the robustness of such models.
%U https://aclanthology.org/2023.ranlp-1.77
%P 712-720
Markdown (Informal)
[Transformer-Based Language Models for Bulgarian](https://aclanthology.org/2023.ranlp-1.77) (Marinova et al., RANLP 2023)
ACL
- Iva Marinova, Kiril Simov, and Petya Osenova. 2023. Transformer-Based Language Models for Bulgarian. In Proceedings of the 14th International Conference on Recent Advances in Natural Language Processing, pages 712–720, Varna, Bulgaria. INCOMA Ltd., Shoumen, Bulgaria.