@inproceedings{barnes-etal-2023-identifying,
title = "Identifying Token-Level Dialectal Features in Social Media",
author = "Barnes, Jeremy and
Touileb, Samia and
M{\ae}hlum, Petter and
Lison, Pierre",
editor = {Alum{\"a}e, Tanel and
Fishel, Mark},
booktitle = "Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa)",
month = may,
year = "2023",
address = "T{\'o}rshavn, Faroe Islands",
publisher = "University of Tartu Library",
url = "https://aclanthology.org/2023.nodalida-1.16",
pages = "146--158",
abstract = "Dialectal variation is present in many human languages and is attracting a growing interest in NLP. Most previous work concentrated on either (1) classifying dialectal varieties at the document or sentence level or (2) performing standard NLP tasks on dialectal data. In this paper, we propose the novel task of token-level dialectal feature prediction. We present a set of fine-grained annotation guidelines for Norwegian dialects, expand a corpus of dialectal tweets, and manually annotate them using the introduced guidelines. Furthermore, to evaluate the learnability of our task, we conduct labeling experiments using a collection of baselines, weakly supervised and supervised sequence labeling models. The obtained results show that, despite the difficulty of the task and the scarcity of training data, many dialectal features can be predicted with reasonably high accuracy.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="barnes-etal-2023-identifying">
<titleInfo>
<title>Identifying Token-Level Dialectal Features in Social Media</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jeremy</namePart>
<namePart type="family">Barnes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Samia</namePart>
<namePart type="family">Touileb</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Petter</namePart>
<namePart type="family">Mæhlum</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pierre</namePart>
<namePart type="family">Lison</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tanel</namePart>
<namePart type="family">Alumäe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mark</namePart>
<namePart type="family">Fishel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>University of Tartu Library</publisher>
<place>
<placeTerm type="text">Tórshavn, Faroe Islands</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Dialectal variation is present in many human languages and is attracting a growing interest in NLP. Most previous work concentrated on either (1) classifying dialectal varieties at the document or sentence level or (2) performing standard NLP tasks on dialectal data. In this paper, we propose the novel task of token-level dialectal feature prediction. We present a set of fine-grained annotation guidelines for Norwegian dialects, expand a corpus of dialectal tweets, and manually annotate them using the introduced guidelines. Furthermore, to evaluate the learnability of our task, we conduct labeling experiments using a collection of baselines, weakly supervised and supervised sequence labeling models. The obtained results show that, despite the difficulty of the task and the scarcity of training data, many dialectal features can be predicted with reasonably high accuracy.</abstract>
<identifier type="citekey">barnes-etal-2023-identifying</identifier>
<location>
<url>https://aclanthology.org/2023.nodalida-1.16</url>
</location>
<part>
<date>2023-05</date>
<extent unit="page">
<start>146</start>
<end>158</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Identifying Token-Level Dialectal Features in Social Media
%A Barnes, Jeremy
%A Touileb, Samia
%A Mæhlum, Petter
%A Lison, Pierre
%Y Alumäe, Tanel
%Y Fishel, Mark
%S Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa)
%D 2023
%8 May
%I University of Tartu Library
%C Tórshavn, Faroe Islands
%F barnes-etal-2023-identifying
%X Dialectal variation is present in many human languages and is attracting a growing interest in NLP. Most previous work concentrated on either (1) classifying dialectal varieties at the document or sentence level or (2) performing standard NLP tasks on dialectal data. In this paper, we propose the novel task of token-level dialectal feature prediction. We present a set of fine-grained annotation guidelines for Norwegian dialects, expand a corpus of dialectal tweets, and manually annotate them using the introduced guidelines. Furthermore, to evaluate the learnability of our task, we conduct labeling experiments using a collection of baselines, weakly supervised and supervised sequence labeling models. The obtained results show that, despite the difficulty of the task and the scarcity of training data, many dialectal features can be predicted with reasonably high accuracy.
%U https://aclanthology.org/2023.nodalida-1.16
%P 146-158
Markdown (Informal)
[Identifying Token-Level Dialectal Features in Social Media](https://aclanthology.org/2023.nodalida-1.16) (Barnes et al., NoDaLiDa 2023)
ACL