BibTeX
@inproceedings{jeon-etal-2023-frustratingly,
title = "A Frustratingly Easy Post-Training Quantization Scheme for {LLM}s",
author = "Jeon, Yongkweon and
Lee, Chungman and
Park, Kyungphil and
Kim, Ho-young",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.emnlp-main.892",
doi = "10.18653/v1/2023.emnlp-main.892",
pages = "14446--14461",
abstract = "Efficient inference has become crucial for hyper-scale AI models, including large language models, as their parameter count continues to increase for enhanced performance. This necessity holds true regardless of the computing environment, whether it be mobile devices or cloud servers. Quantization emerges as a solution to alleviate the computational burden during inference. By representing models with a reduced bit-width, quantization minimizes the frequency of DRAM access while fully exploiting the parallelism of operations through a dense matrix format. Consequently, quantized models achieve low end-to-end latency and optimize resource utilization by addressing both memory and computing bottlenecks. In this paper, we propose a straightforward post-training quantization scheme, called Z-Fold, that fully utilizes the feature of the Transformer structure widely employed in large language models.",
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="jeon-etal-2023-frustratingly">
<titleInfo>
<title>A Frustratingly Easy Post-Training Quantization Scheme for LLMs</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yongkweon</namePart>
<namePart type="family">Jeon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chungman</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kyungphil</namePart>
<namePart type="family">Park</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ho-young</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Houda</namePart>
<namePart type="family">Bouamor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="family">Pino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kalika</namePart>
<namePart type="family">Bali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Efficient inference has become crucial for hyper-scale AI models, including large language models, as their parameter count continues to increase for enhanced performance. This necessity holds true regardless of the computing environment, whether it be mobile devices or cloud servers. Quantization emerges as a solution to alleviate the computational burden during inference. By representing models with a reduced bit-width, quantization minimizes the frequency of DRAM access while fully exploiting the parallelism of operations through a dense matrix format. Consequently, quantized models achieve low end-to-end latency and optimize resource utilization by addressing both memory and computing bottlenecks. In this paper, we propose a straightforward post-training quantization scheme, called Z-Fold, that fully utilizes the feature of the Transformer structure widely employed in large language models.</abstract>
<identifier type="citekey">jeon-etal-2023-frustratingly</identifier>
<identifier type="doi">10.18653/v1/2023.emnlp-main.892</identifier>
<location>
<url>https://aclanthology.org/2023.emnlp-main.892</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>14446</start>
<end>14461</end>
</extent>
</part>
</mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T A Frustratingly Easy Post-Training Quantization Scheme for LLMs
%A Jeon, Yongkweon
%A Lee, Chungman
%A Park, Kyungphil
%A Kim, Ho-young
%Y Bouamor, Houda
%Y Pino, Juan
%Y Bali, Kalika
%S Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F jeon-etal-2023-frustratingly
%X Efficient inference has become crucial for hyper-scale AI models, including large language models, as their parameter count continues to increase for enhanced performance. This necessity holds true regardless of the computing environment, whether it be mobile devices or cloud servers. Quantization emerges as a solution to alleviate the computational burden during inference. By representing models with a reduced bit-width, quantization minimizes the frequency of DRAM access while fully exploiting the parallelism of operations through a dense matrix format. Consequently, quantized models achieve low end-to-end latency and optimize resource utilization by addressing both memory and computing bottlenecks. In this paper, we propose a straightforward post-training quantization scheme, called Z-Fold, that fully utilizes the feature of the Transformer structure widely employed in large language models.
%R 10.18653/v1/2023.emnlp-main.892
%U https://aclanthology.org/2023.emnlp-main.892
%U https://doi.org/10.18653/v1/2023.emnlp-main.892
%P 14446-14461
Markdown (Informal)
[A Frustratingly Easy Post-Training Quantization Scheme for LLMs](https://aclanthology.org/2023.emnlp-main.892) (Jeon et al., EMNLP 2023)
ACL
Yongkweon Jeon, Chungman Lee, Kyungphil Park, and Ho-young Kim. 2023. A Frustratingly Easy Post-Training Quantization Scheme for LLMs. In Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, pages 14446–14461, Singapore. Association for Computational Linguistics.
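
The abstract above describes post-training quantization as representing weights at a reduced bit-width so that DRAM traffic drops while dense-matrix parallelism is preserved. For orientation only, below is a minimal, generic sketch of round-to-nearest, per-channel symmetric weight quantization in NumPy; it is not the paper's Z-Fold scheme, and the function names and the 4-bit setting are illustrative assumptions.

```python
# Generic post-training weight quantization sketch (round-to-nearest, per-channel,
# symmetric). Illustrates the "reduced bit-width" idea from the abstract; it is NOT
# the Z-Fold method proposed in the paper.
import numpy as np

def quantize_per_channel(w: np.ndarray, n_bits: int = 4):
    """Quantize a 2-D FP32 weight matrix to signed integers with one scale per row."""
    qmax = 2 ** (n_bits - 1) - 1                      # e.g. 7 for 4-bit signed
    scale = np.abs(w).max(axis=1, keepdims=True) / qmax
    scale = np.where(scale == 0, 1.0, scale)          # guard against all-zero rows
    q = np.clip(np.round(w / scale), -qmax - 1, qmax).astype(np.int8)
    return q, scale

def dequantize(q: np.ndarray, scale: np.ndarray) -> np.ndarray:
    """Reconstruct an approximate FP32 weight matrix from integers and scales."""
    return q.astype(np.float32) * scale

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    w = rng.standard_normal((8, 16)).astype(np.float32)
    q, s = quantize_per_channel(w, n_bits=4)
    w_hat = dequantize(q, s)
    print("mean abs reconstruction error:", np.abs(w - w_hat).mean())
```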