@inproceedings{khurana-etal-2021-emotionally,
    title = "How Emotionally Stable is {ALBERT}? Testing Robustness with Stochastic Weight Averaging on a Sentiment Analysis Task",
    author = "Khurana, Urja and
      Nalisnick, Eric and
      Fokkens, Antske",
    editor = "Gao, Yang and
      Eger, Steffen and
      Zhao, Wei and
      Lertvittayakumjorn, Piyawat and
      Fomicheva, Marina",
    booktitle = "Proceedings of the 2nd Workshop on Evaluation and Comparison of NLP Systems",
    month = nov,
    year = "2021",
    address = "Punta Cana, Dominican Republic",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2021.eval4nlp-1.3",
    doi = "10.18653/v1/2021.eval4nlp-1.3",
    pages = "16--31",
    abstract = "Despite their success, modern language models are fragile. Even small changes in their training pipeline can lead to unexpected results. We study this phenomenon by examining the robustness of ALBERT (Lan et al., 2020) in combination with Stochastic Weight Averaging (SWA){---}a cheap way of ensembling{---}on a sentiment analysis task (SST-2). In particular, we analyze SWA{'}s stability via CheckList criteria (Ribeiro et al., 2020), examining the agreement on errors made by models differing only in their random seed. We hypothesize that SWA is more stable because it ensembles model snapshots taken along the gradient descent trajectory. We quantify stability by comparing the models{'} mistakes with Fleiss{'} Kappa (Fleiss, 1971) and overlap ratio scores. We find that SWA reduces error rates in general; yet the models still suffer from their own distinct biases (according to CheckList).",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="khurana-etal-2021-emotionally">
    <titleInfo>
      <title>How Emotionally Stable is ALBERT? Testing Robustness with Stochastic Weight Averaging on a Sentiment Analysis Task</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Urja</namePart>
      <namePart type="family">Khurana</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Eric</namePart>
      <namePart type="family">Nalisnick</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Antske</namePart>
      <namePart type="family">Fokkens</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2021-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2nd Workshop on Evaluation and Comparison of NLP Systems</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Yang</namePart>
        <namePart type="family">Gao</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Steffen</namePart>
        <namePart type="family">Eger</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Wei</namePart>
        <namePart type="family">Zhao</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Piyawat</namePart>
        <namePart type="family">Lertvittayakumjorn</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Marina</namePart>
        <namePart type="family">Fomicheva</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Punta Cana, Dominican Republic</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Despite their success, modern language models are fragile. Even small changes in their training pipeline can lead to unexpected results. We study this phenomenon by examining the robustness of ALBERT (Lan et al., 2020) in combination with Stochastic Weight Averaging (SWA)—a cheap way of ensembling—on a sentiment analysis task (SST-2). In particular, we analyze SWA’s stability via CheckList criteria (Ribeiro et al., 2020), examining the agreement on errors made by models differing only in their random seed. We hypothesize that SWA is more stable because it ensembles model snapshots taken along the gradient descent trajectory. We quantify stability by comparing the models’ mistakes with Fleiss’ Kappa (Fleiss, 1971) and overlap ratio scores. We find that SWA reduces error rates in general; yet the models still suffer from their own distinct biases (according to CheckList).</abstract>
    <identifier type="citekey">khurana-etal-2021-emotionally</identifier>
    <identifier type="doi">10.18653/v1/2021.eval4nlp-1.3</identifier>
    <location>
      <url>https://aclanthology.org/2021.eval4nlp-1.3</url>
    </location>
    <part>
      <date>2021-11</date>
      <extent unit="page">
        <start>16</start>
        <end>31</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T How Emotionally Stable is ALBERT? Testing Robustness with Stochastic Weight Averaging on a Sentiment Analysis Task
%A Khurana, Urja
%A Nalisnick, Eric
%A Fokkens, Antske
%Y Gao, Yang
%Y Eger, Steffen
%Y Zhao, Wei
%Y Lertvittayakumjorn, Piyawat
%Y Fomicheva, Marina
%S Proceedings of the 2nd Workshop on Evaluation and Comparison of NLP Systems
%D 2021
%8 November
%I Association for Computational Linguistics
%C Punta Cana, Dominican Republic
%F khurana-etal-2021-emotionally
%X Despite their success, modern language models are fragile. Even small changes in their training pipeline can lead to unexpected results. We study this phenomenon by examining the robustness of ALBERT (Lan et al., 2020) in combination with Stochastic Weight Averaging (SWA)—a cheap way of ensembling—on a sentiment analysis task (SST-2). In particular, we analyze SWA’s stability via CheckList criteria (Ribeiro et al., 2020), examining the agreement on errors made by models differing only in their random seed. We hypothesize that SWA is more stable because it ensembles model snapshots taken along the gradient descent trajectory. We quantify stability by comparing the models’ mistakes with Fleiss’ Kappa (Fleiss, 1971) and overlap ratio scores. We find that SWA reduces error rates in general; yet the models still suffer from their own distinct biases (according to CheckList).
%R 10.18653/v1/2021.eval4nlp-1.3
%U https://aclanthology.org/2021.eval4nlp-1.3
%U https://doi.org/10.18653/v1/2021.eval4nlp-1.3
%P 16-31
Markdown (Informal)
[How Emotionally Stable is ALBERT? Testing Robustness with Stochastic Weight Averaging on a Sentiment Analysis Task](https://aclanthology.org/2021.eval4nlp-1.3) (Khurana et al., Eval4NLP 2021)
ACL
Urja Khurana, Eric Nalisnick, and Antske Fokkens. 2021. [How Emotionally Stable is ALBERT? Testing Robustness with Stochastic Weight Averaging on a Sentiment Analysis Task](https://aclanthology.org/2021.eval4nlp-1.3). In Proceedings of the 2nd Workshop on Evaluation and Comparison of NLP Systems, pages 16–31, Punta Cana, Dominican Republic. Association for Computational Linguistics.