% Workshop paper from the VarDial 2020 proceedings (ACL Anthology record 2020.vardial-1.20).
@inproceedings{rebeja-cristea-2020-dual,
    title     = {A dual-encoding system for dialect classification},
    author    = {Rebeja, Petru and
                 Cristea, Dan},
    editor    = {Zampieri, Marcos and
                 Nakov, Preslav and
                 Ljube{\v{s}}i{\'c}, Nikola and
                 Tiedemann, J{\"o}rg and
                 Scherrer, Yves},
    booktitle = {Proceedings of the 7th Workshop on {NLP} for Similar Languages, Varieties and Dialects},
    month     = dec,
    year      = {2020},
    address   = {Barcelona, Spain (Online)},
    publisher = {International Committee on Computational Linguistics (ICCL)},
    url       = {https://aclanthology.org/2020.vardial-1.20},
    pages     = {212--219},
    abstract  = {In this paper we present the architecture, processing pipeline and results of the ensemble model developed for Romanian Dialect Identification task. The ensemble model consists of two TF-IDF encoders and a deep learning model aimed together at classifying input samples based on the writing patterns which are specific to each of the two dialects. Although the model performs well on the training set, its performance degrades heavily on the evaluation set. The drop in performance is due to the design decision which makes the model put too much weight on presence/lack of textual marks when determining the sample label.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for citekey rebeja-cristea-2020-dual. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="rebeja-cristea-2020-dual">
    <titleInfo>
      <title>A dual-encoding system for dialect classification</title>
    </titleInfo>
    <!-- Authors of the paper, in listed order. -->
    <name type="personal">
      <namePart type="given">Petru</namePart>
      <namePart type="family">Rebeja</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Dan</namePart>
      <namePart type="family">Cristea</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2020-12</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, its editors, publisher and place. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 7th Workshop on NLP for Similar Languages, Varieties and Dialects</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Marcos</namePart>
        <namePart type="family">Zampieri</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Preslav</namePart>
        <namePart type="family">Nakov</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Nikola</namePart>
        <namePart type="family">Ljubešić</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jörg</namePart>
        <namePart type="family">Tiedemann</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yves</namePart>
        <namePart type="family">Scherrer</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>International Committee on Computational Linguistics (ICCL)</publisher>
        <place>
          <placeTerm type="text">Barcelona, Spain (Online)</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>In this paper we present the architecture, processing pipeline and results of the ensemble model developed for Romanian Dialect Identification task. The ensemble model consists of two TF-IDF encoders and a deep learning model aimed together at classifying input samples based on the writing patterns which are specific to each of the two dialects. Although the model performs well on the training set, its performance degrades heavily on the evaluation set. The drop in performance is due to the design decision which makes the model put too much weight on presence/lack of textual marks when determining the sample label.</abstract>
    <identifier type="citekey">rebeja-cristea-2020-dual</identifier>
    <location>
      <url>https://aclanthology.org/2020.vardial-1.20</url>
    </location>
    <!-- Issue date and page extent of the paper. -->
    <part>
      <date>2020-12</date>
      <extent unit="page">
        <start>212</start>
        <end>219</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T A dual-encoding system for dialect classification
%A Rebeja, Petru
%A Cristea, Dan
%Y Zampieri, Marcos
%Y Nakov, Preslav
%Y Ljubešić, Nikola
%Y Tiedemann, Jörg
%Y Scherrer, Yves
%S Proceedings of the 7th Workshop on NLP for Similar Languages, Varieties and Dialects
%D 2020
%8 December
%I International Committee on Computational Linguistics (ICCL)
%C Barcelona, Spain (Online)
%F rebeja-cristea-2020-dual
%X In this paper we present the architecture, processing pipeline and results of the ensemble model developed for Romanian Dialect Identification task. The ensemble model consists of two TF-IDF encoders and a deep learning model aimed together at classifying input samples based on the writing patterns which are specific to each of the two dialects. Although the model performs well on the training set, its performance degrades heavily on the evaluation set. The drop in performance is due to the design decision which makes the model put too much weight on presence/lack of textual marks when determining the sample label.
%U https://aclanthology.org/2020.vardial-1.20
%P 212-219
Markdown (Informal)
[A dual-encoding system for dialect classification](https://aclanthology.org/2020.vardial-1.20) (Rebeja & Cristea, VarDial 2020)
ACL
- Petru Rebeja and Dan Cristea. 2020. A dual-encoding system for dialect classification. In Proceedings of the 7th Workshop on NLP for Similar Languages, Varieties and Dialects, pages 212–219, Barcelona, Spain (Online). International Committee on Computational Linguistics (ICCL).