@inproceedings{bespalov-etal-2023-towards,
    title = "Towards Building a Robust Toxicity Predictor",
    author = "Bespalov, Dmitriy  and
      Bhabesh, Sourav  and
      Xiang, Yi  and
      Zhou, Liutong  and
      Qi, Yanjun",
    editor = "Sitaram, Sunayana  and
      Beigman Klebanov, Beata  and
      Williams, Jason D",
    booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 5: Industry Track)",
    month = jul,
    year = "2023",
    address = "Toronto, Canada",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2023.acl-industry.56",
    doi = "10.18653/v1/2023.acl-industry.56",
    pages = "581--598",
    abstract = "Recent NLP literature pays little attention to the robustness of toxicity language predictors, while these systems are most likely to be used in adversarial contexts. This paper presents a novel adversarial attack, \texttt{ToxicTrap}, introducing small word-level perturbations to fool SOTA text classifiers to predict toxic text samples as benign. \texttt{ToxicTrap} exploits greedy based search strategies to enable fast and effective generation of toxic adversarial examples. Two novel goal function designs allow \texttt{ToxicTrap} to identify weaknesses in both multiclass and multilabel toxic language detectors. Our empirical results show that SOTA toxicity text classifiers are indeed vulnerable to the proposed attacks, attaining over 98\% attack success rates in multilabel cases. We also show how a vanilla adversarial training and its improved version can help increase robustness of a toxicity detector even against unseen attacks.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="bespalov-etal-2023-towards">
    <titleInfo>
      <title>Towards Building a Robust Toxicity Predictor</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Dmitriy</namePart>
      <namePart type="family">Bespalov</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sourav</namePart>
      <namePart type="family">Bhabesh</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yi</namePart>
      <namePart type="family">Xiang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Liutong</namePart>
      <namePart type="family">Zhou</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yanjun</namePart>
      <namePart type="family">Qi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2023-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 5: Industry Track)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Sunayana</namePart>
        <namePart type="family">Sitaram</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Beata</namePart>
        <namePart type="family">Beigman Klebanov</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jason</namePart>
        <namePart type="given">D</namePart>
        <namePart type="family">Williams</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Toronto, Canada</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Recent NLP literature pays little attention to the robustness of toxicity language predictors, while these systems are most likely to be used in adversarial contexts. This paper presents a novel adversarial attack, ToxicTrap, introducing small word-level perturbations to fool SOTA text classifiers to predict toxic text samples as benign. ToxicTrap exploits greedy based search strategies to enable fast and effective generation of toxic adversarial examples. Two novel goal function designs allow ToxicTrap to identify weaknesses in both multiclass and multilabel toxic language detectors. Our empirical results show that SOTA toxicity text classifiers are indeed vulnerable to the proposed attacks, attaining over 98% attack success rates in multilabel cases. We also show how a vanilla adversarial training and its improved version can help increase robustness of a toxicity detector even against unseen attacks.</abstract>
    <identifier type="citekey">bespalov-etal-2023-towards</identifier>
    <identifier type="doi">10.18653/v1/2023.acl-industry.56</identifier>
    <location>
      <url>https://aclanthology.org/2023.acl-industry.56</url>
    </location>
    <part>
      <date>2023-07</date>
      <extent unit="page">
        <start>581</start>
        <end>598</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Towards Building a Robust Toxicity Predictor
%A Bespalov, Dmitriy
%A Bhabesh, Sourav
%A Xiang, Yi
%A Zhou, Liutong
%A Qi, Yanjun
%Y Sitaram, Sunayana
%Y Beigman Klebanov, Beata
%Y Williams, Jason D.
%S Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 5: Industry Track)
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F bespalov-etal-2023-towards
%X Recent NLP literature pays little attention to the robustness of toxicity language predictors, while these systems are most likely to be used in adversarial contexts. This paper presents a novel adversarial attack, ToxicTrap, introducing small word-level perturbations to fool SOTA text classifiers to predict toxic text samples as benign. ToxicTrap exploits greedy based search strategies to enable fast and effective generation of toxic adversarial examples. Two novel goal function designs allow ToxicTrap to identify weaknesses in both multiclass and multilabel toxic language detectors. Our empirical results show that SOTA toxicity text classifiers are indeed vulnerable to the proposed attacks, attaining over 98% attack success rates in multilabel cases. We also show how a vanilla adversarial training and its improved version can help increase robustness of a toxicity detector even against unseen attacks.
%R 10.18653/v1/2023.acl-industry.56
%U https://aclanthology.org/2023.acl-industry.56
%U https://doi.org/10.18653/v1/2023.acl-industry.56
%P 581-598
Markdown (Informal)
[Towards Building a Robust Toxicity Predictor](https://aclanthology.org/2023.acl-industry.56) (Bespalov et al., ACL 2023)

ACL
Dmitriy Bespalov, Sourav Bhabesh, Yi Xiang, Liutong Zhou, and Yanjun Qi. 2023. Towards Building a Robust Toxicity Predictor. In Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 5: Industry Track), pages 581–598, Toronto, Canada. Association for Computational Linguistics.
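
For readers who want to reproduce the kind of attack the abstract describes (greedy, word-level perturbations that flip a toxic prediction to benign), the sketch below uses the open-source TextAttack framework. It is a minimal illustration only, not the authors' ToxicTrap implementation: the victim model name, the example inputs, and the specific transformation and search components are assumptions chosen for demonstration.

# Minimal sketch of a greedy word-level adversarial attack on a toxicity
# classifier, in the spirit of the attack described in the abstract.
# NOT the paper's ToxicTrap code; model and data below are illustrative.
import transformers
from textattack import Attack, Attacker, AttackArgs
from textattack.models.wrappers import HuggingFaceModelWrapper
from textattack.goal_functions import UntargetedClassification
from textattack.search_methods import GreedyWordSwapWIR
from textattack.transformations import WordSwapEmbedding
from textattack.constraints.pre_transformation import (
    RepeatModification,
    StopwordModification,
)
from textattack.datasets import Dataset

# Hypothetical victim model: any HuggingFace toxicity classifier works here.
model_name = "martin-ha/toxic-comment-model"  # assumption, not from the paper
model = transformers.AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
model_wrapper = HuggingFaceModelWrapper(model, tokenizer)

# Goal function: succeed when the prediction moves away from the true
# (toxic) label, i.e. the toxic sample is misclassified as benign.
goal_function = UntargetedClassification(model_wrapper)

# Word-level perturbation: swap words for nearest neighbors in a
# counter-fitted embedding space.
transformation = WordSwapEmbedding(max_candidates=20)

# Greedy search ordered by word importance ranking (WIR), one common
# greedy-based search strategy.
search_method = GreedyWordSwapWIR(wir_method="delete")

# Do not modify the same word twice, and leave stopwords alone.
constraints = [RepeatModification(), StopwordModification()]

attack = Attack(goal_function, constraints, transformation, search_method)

# Tiny in-memory dataset of (text, label) pairs; label 1 = toxic (assumption).
dataset = Dataset([("you are a worthless idiot", 1)])
attacker = Attacker(attack, dataset, AttackArgs(num_examples=1))
attacker.attack_dataset()

Swapping in a multilabel-aware goal function, as the paper proposes for multilabel toxicity detectors, would require a custom goal-function class; TextAttack's built-in UntargetedClassification shown here covers only the standard multiclass case.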