@inproceedings{risch-etal-2021-toxic,
title = "Data Integration for Toxic Comment Classification: Making More Than 40 Datasets Easily Accessible in One Unified Format",
author = "Risch, Julian and
Schmidt, Philipp and
Krestel, Ralf",
editor = "Mostafazadeh Davani, Aida and
Kiela, Douwe and
Lambert, Mathias and
Vidgen, Bertie and
Prabhakaran, Vinodkumar and
Waseem, Zeerak",
booktitle = "Proceedings of the 5th Workshop on Online Abuse and Harms (WOAH 2021)",
month = aug,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.woah-1.17",
doi = "10.18653/v1/2021.woah-1.17",
pages = "157--163",
abstract = "With the rise of research on toxic comment classification, more and more annotated datasets have been released. The wide variety of the task (different languages, different labeling processes and schemes) has led to a large amount of heterogeneous datasets that can be used for training and testing very specific settings. Despite recent efforts to create web pages that provide an overview, most publications still use only a single dataset. They are not stored in one central database, they come in many different data formats and it is difficult to interpret their class labels and how to reuse these labels in other projects. To overcome these issues, we present a collection of more than thirty datasets in the form of a software tool that automatizes downloading and processing of the data and presents them in a unified data format that also offers a mapping of compatible class labels. Another advantage of that tool is that it gives an overview of properties of available datasets, such as different languages, platforms, and class labels to make it easier to select suitable training and test data.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="risch-etal-2021-toxic">
<titleInfo>
<title>Data Integration for Toxic Comment Classification: Making More Than 40 Datasets Easily Accessible in One Unified Format</title>
</titleInfo>
<name type="personal">
<namePart type="given">Julian</namePart>
<namePart type="family">Risch</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philipp</namePart>
<namePart type="family">Schmidt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ralf</namePart>
<namePart type="family">Krestel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 5th Workshop on Online Abuse and Harms (WOAH 2021)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Aida</namePart>
<namePart type="family">Mostafazadeh Davani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Douwe</namePart>
<namePart type="family">Kiela</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mathias</namePart>
<namePart type="family">Lambert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bertie</namePart>
<namePart type="family">Vidgen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vinodkumar</namePart>
<namePart type="family">Prabhakaran</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zeerak</namePart>
<namePart type="family">Waseem</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>With the rise of research on toxic comment classification, more and more annotated datasets have been released. The wide variety of the task (different languages, different labeling processes and schemes) has led to a large amount of heterogeneous datasets that can be used for training and testing very specific settings. Despite recent efforts to create web pages that provide an overview, most publications still use only a single dataset. They are not stored in one central database, they come in many different data formats and it is difficult to interpret their class labels and how to reuse these labels in other projects. To overcome these issues, we present a collection of more than thirty datasets in the form of a software tool that automatizes downloading and processing of the data and presents them in a unified data format that also offers a mapping of compatible class labels. Another advantage of that tool is that it gives an overview of properties of available datasets, such as different languages, platforms, and class labels to make it easier to select suitable training and test data.</abstract>
<identifier type="citekey">risch-etal-2021-toxic</identifier>
<identifier type="doi">10.18653/v1/2021.woah-1.17</identifier>
<location>
<url>https://aclanthology.org/2021.woah-1.17</url>
</location>
<part>
<date>2021-08</date>
<extent unit="page">
<start>157</start>
<end>163</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Data Integration for Toxic Comment Classification: Making More Than 40 Datasets Easily Accessible in One Unified Format
%A Risch, Julian
%A Schmidt, Philipp
%A Krestel, Ralf
%Y Mostafazadeh Davani, Aida
%Y Kiela, Douwe
%Y Lambert, Mathias
%Y Vidgen, Bertie
%Y Prabhakaran, Vinodkumar
%Y Waseem, Zeerak
%S Proceedings of the 5th Workshop on Online Abuse and Harms (WOAH 2021)
%D 2021
%8 August
%I Association for Computational Linguistics
%C Online
%F risch-etal-2021-toxic
%X With the rise of research on toxic comment classification, more and more annotated datasets have been released. The wide variety of the task (different languages, different labeling processes and schemes) has led to a large amount of heterogeneous datasets that can be used for training and testing very specific settings. Despite recent efforts to create web pages that provide an overview, most publications still use only a single dataset. They are not stored in one central database, they come in many different data formats and it is difficult to interpret their class labels and how to reuse these labels in other projects. To overcome these issues, we present a collection of more than thirty datasets in the form of a software tool that automatizes downloading and processing of the data and presents them in a unified data format that also offers a mapping of compatible class labels. Another advantage of that tool is that it gives an overview of properties of available datasets, such as different languages, platforms, and class labels to make it easier to select suitable training and test data.
%R 10.18653/v1/2021.woah-1.17
%U https://aclanthology.org/2021.woah-1.17
%U https://doi.org/10.18653/v1/2021.woah-1.17
%P 157-163
Markdown (Informal)
[Data Integration for Toxic Comment Classification: Making More Than 40 Datasets Easily Accessible in One Unified Format](https://aclanthology.org/2021.woah-1.17) (Risch et al., WOAH 2021)
ACL