
Efficient Memory Management for Large Language Model Serving with PagedAttention (2309.06180v1)

Published 12 Sep 2023 in cs.LG and cs.DC

Abstract: High throughput serving of large language models (LLMs) requires batching sufficiently many requests at a time. However, existing systems struggle because the key-value cache (KV cache) memory for each request is huge and grows and shrinks dynamically. When managed inefficiently, this memory can be significantly wasted by fragmentation and redundant duplication, limiting the batch size. To address this problem, we propose PagedAttention, an attention algorithm inspired by the classical virtual memory and paging techniques in operating systems. On top of it, we build vLLM, an LLM serving system that achieves (1) near-zero waste in KV cache memory and (2) flexible sharing of KV cache within and across requests to further reduce memory usage. Our evaluations show that vLLM improves the throughput of popular LLMs by 2-4$\times$ with the same level of latency compared to the state-of-the-art systems, such as FasterTransformer and Orca. The improvement is more pronounced with longer sequences, larger models, and more complex decoding algorithms. vLLM's source code is publicly available at https://github.com/vllm-project/vllm
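
To make the paging analogy concrete, here is a minimal, hypothetical sketch of block-based KV cache bookkeeping in the spirit of PagedAttention. The class and method names are illustrative assumptions, not vLLM's actual implementation: each sequence owns a block table that maps logical token positions to fixed-size physical blocks drawn from a shared free pool, much as an OS maps virtual pages to physical frames.

```python
# Illustrative sketch only (hypothetical names); not vLLM's real code.

class BlockAllocator:
    """Hands out fixed-size KV cache blocks from a shared free pool (like page frames)."""

    def __init__(self, num_blocks: int, block_size: int):
        self.block_size = block_size               # tokens stored per block
        self.free_blocks = list(range(num_blocks)) # ids of unused physical blocks

    def allocate(self) -> int:
        if not self.free_blocks:
            # A real serving system would preempt or swap a request here.
            raise MemoryError("KV cache exhausted")
        return self.free_blocks.pop()

    def free(self, block_id: int) -> None:
        self.free_blocks.append(block_id)


class SequenceKVCache:
    """Per-request block table: logical block index -> physical block id."""

    def __init__(self, allocator: BlockAllocator):
        self.allocator = allocator
        self.block_table: list[int] = []
        self.num_tokens = 0

    def append_token(self) -> tuple[int, int]:
        """Reserve a KV slot for one new token, growing the table on demand."""
        if self.num_tokens % self.allocator.block_size == 0:
            self.block_table.append(self.allocator.allocate())
        slot = self.num_tokens % self.allocator.block_size
        self.num_tokens += 1
        return self.block_table[-1], slot          # where an attention kernel would write K/V

    def release(self) -> None:
        for block_id in self.block_table:
            self.allocator.free(block_id)
        self.block_table.clear()
```

Because blocks are fixed-size and allocated on demand, internal fragmentation is bounded by at most one partially filled block per sequence, and sequences sharing a common prefix can point their block tables at the same physical blocks (with copy-on-write), which is the intra- and inter-request sharing the abstract refers to.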

Here is a list of references cited in the paper with their corresponding bibliographic entries:

  • @article{vaswani2017attention, title={Attention is all you need}, author={Vaswani, Ashish and Shazeer, Noam and others}, journal={Advances in neural information processing systems}, volume={30}, year={2017}}
  • @article{paszke2019pytorch, title={Pytorch: An imperative style, high-performance deep learning library}, author={Paszke, Adam and Gross, Sam and others}, journal={Advances in neural information processing systems}, volume={32}, year={2019}}
  • @article{ba2016layer, title={Layer normalization}, author={Ba, Jimmy Lei and Kiros, Jamie Ryan and Hinton, Geoffrey E}, journal={arXiv preprint}, year={2016}}
  • @article{zhang2019root, title={Root mean square layer normalization}, author={Zhang, Biao and Sennrich, Rico}, journal={Advances in Neural Information Processing Systems}, volume={32}, year={2019}}
  • @article{su2021roformer, title={Roformer: Enhanced transformer with rotary position embedding}, author={Su, Jianlin and others}, journal={arXiv preprint}, year={2021}}
  • @article{shazeer2020glu, title={Glu variants improve transformer}, author={Shazeer, Noam}, journal={arXiv preprint}, year={2020}}
  • @inproceedings{moritz2018ray, title={Ray: A distributed framework for emerging AI applications}, author={Moritz, Philipp and others}, booktitle={13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)}, pages={561--577}, year={2018}}
  • @inproceedings{wolf2020transformers, title={Transformers: State-of-the-art natural language processing}, author={Wolf, Thomas and others}, booktitle={Proceedings of the 2020 conference on empirical methods in natural language processing: system demonstrations}, pages={38--45}, year={2020}}
  • @article{touvron2023llama, title={Llama: Open and efficient foundation language models}, author={Touvron, Hugo and others}, journal={arXiv preprint}, year={2023}}
  • @article{brown2020language, title={Language models are few-shot learners}, author={Brown, Tom and others}, journal={Advances in neural information processing systems}, volume={33}, pages={1877--1901}, year={2020}}
  • @misc{openai2023gpt4, title={GPT-4 Technical Report}, author={OpenAI}, year={2023}, eprint={2303.08774}, archivePrefix={arXiv}, primaryClass={cs.CL}}
  • @article{wei2022emergent, title={Emergent abilities of large language models}, author={Wei, Jason and others}, journal={arXiv preprint}, year={2022}}
  • @article{scao2022bloom, title={Bloom: A 176b-parameter open-access multilingual language model}, author={Scao, Teven Le and others}, journal={arXiv preprint}, year={2022}}
  • @article{black2022gpt, title={Gpt-neox-20b: An open-source autoregressive language model}, author={Black, Sid and others}, journal={arXiv preprint}, year={2022}}
  • @article{bengio2000neural, title={A neural probabilistic language model}, author={Bengio, Yoshua and others}, journal={Advances in neural information processing systems}, volume={13}, year={2000}}
  • @inproceedings{he2016deep, title={Deep residual learning for image recognition}, author={He, Kaiming and others}, booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, pages={770--778}, year={2016}}
  • @article{chowdhery2022palm, title={Palm: Scaling language modeling with pathways}, author={Chowdhery, Aakanksha and others}, journal={arXiv preprint}, year={2022}}
  • @misc{openaiapi-pricing, title={OpenAI API Pricing}, author={OpenAI}, year={2023}}
  • @misc{chat-cost, title={Tech giants' AI like Bing, Bard poses billion-dollar search problem}, author={Reuters}, year={2023}}
  • @misc{openaiapi, title={OpenAI API}, author={OpenAI}, year={2020}}
  • @misc{chatgpt, title={OpenAI ChatGPT}, author={OpenAI}, year={2022}}
  • @misc{chatgptuserprompt, title={Custom instructions for ChatGPT}, author={OpenAI}, year={2023}}
  • @misc{copilot, title={Github Copilot}, author={Github}, year={2022}}
  • @misc{bard, title={Google Bard}, author={Google}, year={2023}}
  • @misc{amazonbedrock, title={Amazon Bedrock}, author={Amazon Web Services}, year={2023}}
  • @article{wiseman2016sequence, title={Sequence-to-sequence learning as beam-search optimization}, author={Wiseman, Sam and Rush, Alexander M}, journal={arXiv preprint}, year={2016}}
  • @inproceedings{yu2022orca, title={Orca: A Distributed Serving System for Transformer-Based Generative Models}, author={Yu, Gyeong-In and others}, booktitle={16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)}, pages={521--538}, year={2022}}
  • @article{zhang2022opt, title={Opt: Open pre-trained transformer language models}, author={Zhang, Susan and others}, journal={arXiv preprint}, year={2022}}
  • @article{sutskever2014sequence, title={Sequence to sequence learning with neural networks}, author={Sutskever, Ilya and others}, journal={Advances in neural information processing systems}, volume={27}, year={2014}}
  • @article{kilburn1962one, title={One-level storage system}, author={Kilburn, Tom and others}, journal={IRE Transactions on Electronic Computers}, number={2}, pages={223--235}, year={1962}, publisher={IEEE}}
  • @article{li2023alpaserve, title={AlpaServe: Statistical Multiplexing with Model Parallelism for Deep Learning Serving}, author={Li, Zhuohan and others}, journal={arXiv preprint}, year={2023}}
  • @inproceedings{crankshaw2017clipper, title={Clipper: A Low-Latency Online Prediction Serving System}, author={Crankshaw, Daniel and others}, booktitle={14th USENIX Symposium on Networked Systems Design and Implementation (NSDI 17)}, pages={613--627}, year={2017}}
  • @inproceedings{crankshaw2020inferline, title={InferLine: latency-aware provisioning and scaling for prediction serving pipelines}, author={Crankshaw, Daniel and others}, booktitle={Proceedings of the 11th ACM Symposium on Cloud Computing}, pages={477--491}, year={2020}}
  • @inproceedings{cui2022dvabatch, title={DVABatch: Diversity-aware Multi-Entry Multi-Exit Batching for Efficient Processing of DNN Services on GPUs}, author={Cui, Weihao and others}, booktitle={2022 USENIX Annual Technical Conference (USENIX ATC 22)}, pages={183--198}, year={2022}}
  • @inproceedings{zhou2022pets, title={PetS: A Unified Framework for Parameter-Efficient Transformers Serving}, author={Zhou, Zhe and others}, booktitle={2022 USENIX Annual Technical Conference (USENIX ATC 22)}, pages={489--504}, year={2022}}
  • @inproceedings{fang2021turbotransformers, title={TurboTransformers: an efficient GPU serving system for transformer models}, author={Fang, Jiarui and others}, booktitle={Proceedings of the 26th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming}, pages={389--402}, year={2021}}
  • @article{olston2017tensorflow, title={Tensorflow-serving: Flexible, high-performance ml serving}, author={Olston, Christopher and others}, journal={arXiv preprint}, year={2017}}
  • @misc{nvidiatriton, title={Triton Inference Server}, author={NVIDIA}, year={2023}}
  • @misc{sharegpt, title={ShareGPT}, url={https://sharegpt.com/}, author={ShareGPT Team}, year={2023}}
  • @misc{alpaca, title={Stanford Alpaca: An Instruction-following LLaMA model}, author={Rohan Taori and others}, journal={GitHub repository}, year={2023}}
  • @article{wang2022self, title={Self-Instruct: Aligning Language Model with Self Generated Instructions}, author={Wang, Yizhong and others}, journal={arXiv preprint}, year={2022}}
  • @misc{nvidiaft, title={FasterTransformer}, author={NVIDIA}, year={2023}}
  • @misc{lmsysweek8, title={Chatbot Arena Leaderboard Week 8: Introducing MT-Bench and Vicuna-33B}, author={LMSYS ORG}, year={2023}}
  • @misc{nccl, title={NCCL: The NVIDIA Collective Communication Library}, author={NVIDIA}, year={2023}}
  • @misc{fastapi, title={FastAPI}, year={2023}}
  • @article{jain2020checkmate, title={Checkmate: Breaking the memory wall with optimal tensor rematerialization}, author={Jain, Paras and others}, journal={Proceedings of Machine Learning and Systems}, volume={2}, pages={497--511}, year={2020}}
  • @article{chen2016training, title={Training deep nets with sublinear memory cost}, author={Chen, Tianqi and others}, journal={arXiv preprint}, year={2016}}
  • @inproceedings{ma2020rammer, title={Rammer: Enabling holistic deep learning compiler optimizations with rtasks}, author={Ma, Lingxiao and others}, booktitle={Proceedings of the 14th USENIX Conference on Operating Systems Design and Implementation}, pages={881--897}, year={2020}}
  • @article{steiner2022olla, title={OLLA: Optimizing the Lifetime and Location of Arrays to Reduce the Memory Usage of Neural Networks}, author={Steiner, Benoit and others}, year={2022}}
  • @article{rabe2021self, title={Self-attention Does Not Need $O(n^2)$ Memory}, author={Rabe, Markus N and Staats, Charles}, journal={arXiv preprint}, year={2021}}

These references provide a comprehensive background and context for the paper's contributions and methodology.

Authors (9)
  1. Woosuk Kwon (9 papers)
  2. Zhuohan Li (29 papers)
  3. Siyuan Zhuang (9 papers)
  4. Ying Sheng (31 papers)
  5. Lianmin Zheng (34 papers)
  6. Cody Hao Yu (13 papers)
  7. Joseph E. Gonzalez (167 papers)
  8. Hao Zhang (947 papers)
  9. Ion Stoica (177 papers)
Citations (1,068)