% Lee and Shin (2021), W-NUT 2021 workshop paper (ACL Anthology ID 2021.wnut-1.45).
% Braced words inside title/booktitle keep their capitalization when a style applies sentence casing.
@inproceedings{lee-shin-2021-korean,
  title     = {The {Korean} Morphologically Tight-Fitting Tokenizer for Noisy User-Generated Texts},
  author    = {Lee, Sangah and
               Shin, Hyopil},
  editor    = {Xu, Wei and
               Ritter, Alan and
               Baldwin, Tim and
               Rahimi, Afshin},
  booktitle = {Proceedings of the Seventh Workshop on Noisy User-generated Text ({W-NUT} 2021)},
  month     = nov,
  year      = {2021},
  address   = {Online},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2021.wnut-1.45},
  doi       = {10.18653/v1/2021.wnut-1.45},
  pages     = {410--416},
  abstract  = {User-generated texts include various types of stylistic properties, or noises. Such texts are not properly processed by existing morpheme analyzers or language models based on formal texts such as encyclopedias or news articles. In this paper, we propose a simple morphologically tight-fitting tokenizer (K-MT) that can better process proper nouns, coinages, and internet slang among other types of noise in Korean user-generated texts. We tested our tokenizer by performing classification tasks on Korean user-generated movie reviews and hate speech datasets, and the Korean Named Entity Recognition dataset. Through our tests, we found that K-MT is better fit to process internet slangs, proper nouns, and coinages, compared to a morpheme analyzer and a character-level WordPiece tokenizer.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for citekey lee-shin-2021-korean: the paper itself, with its host proceedings nested as a relatedItem. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="lee-shin-2021-korean">
    <titleInfo>
      <title>The Korean Morphologically Tight-Fitting Tokenizer for Noisy User-Generated Texts</title>
    </titleInfo>
    <!-- Authors of the paper -->
    <name type="personal">
      <namePart type="given">Sangah</namePart>
      <namePart type="family">Lee</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Hyopil</namePart>
      <namePart type="family">Shin</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2021-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host volume: proceedings title, editors, publisher and place -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Seventh Workshop on Noisy User-generated Text (W-NUT 2021)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Wei</namePart>
        <namePart type="family">Xu</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Alan</namePart>
        <namePart type="family">Ritter</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Tim</namePart>
        <namePart type="family">Baldwin</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Afshin</namePart>
        <namePart type="family">Rahimi</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Online</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>User-generated texts include various types of stylistic properties, or noises. Such texts are not properly processed by existing morpheme analyzers or language models based on formal texts such as encyclopedias or news articles. In this paper, we propose a simple morphologically tight-fitting tokenizer (K-MT) that can better process proper nouns, coinages, and internet slang among other types of noise in Korean user-generated texts. We tested our tokenizer by performing classification tasks on Korean user-generated movie reviews and hate speech datasets, and the Korean Named Entity Recognition dataset. Through our tests, we found that K-MT is better fit to process internet slangs, proper nouns, and coinages, compared to a morpheme analyzer and a character-level WordPiece tokenizer.</abstract>
    <!-- Identifiers and online location -->
    <identifier type="citekey">lee-shin-2021-korean</identifier>
    <identifier type="doi">10.18653/v1/2021.wnut-1.45</identifier>
    <location>
      <url>https://aclanthology.org/2021.wnut-1.45</url>
    </location>
    <!-- Issue date and page extent of this part -->
    <part>
      <date>2021-11</date>
      <extent unit="page">
        <start>410</start>
        <end>416</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T The Korean Morphologically Tight-Fitting Tokenizer for Noisy User-Generated Texts
%A Lee, Sangah
%A Shin, Hyopil
%Y Xu, Wei
%Y Ritter, Alan
%Y Baldwin, Tim
%Y Rahimi, Afshin
%S Proceedings of the Seventh Workshop on Noisy User-generated Text (W-NUT 2021)
%D 2021
%8 November
%I Association for Computational Linguistics
%C Online
%F lee-shin-2021-korean
%X User-generated texts include various types of stylistic properties, or noises. Such texts are not properly processed by existing morpheme analyzers or language models based on formal texts such as encyclopedias or news articles. In this paper, we propose a simple morphologically tight-fitting tokenizer (K-MT) that can better process proper nouns, coinages, and internet slang among other types of noise in Korean user-generated texts. We tested our tokenizer by performing classification tasks on Korean user-generated movie reviews and hate speech datasets, and the Korean Named Entity Recognition dataset. Through our tests, we found that K-MT is better fit to process internet slangs, proper nouns, and coinages, compared to a morpheme analyzer and a character-level WordPiece tokenizer.
%R 10.18653/v1/2021.wnut-1.45
%U https://aclanthology.org/2021.wnut-1.45
%U https://doi.org/10.18653/v1/2021.wnut-1.45
%P 410-416
Markdown (Informal)
[The Korean Morphologically Tight-Fitting Tokenizer for Noisy User-Generated Texts](https://aclanthology.org/2021.wnut-1.45) (Lee & Shin, WNUT 2021)
ACL