Here is a list of references cited in the paper with their corresponding bibliographic entries:
- @article{vaswani2017attention, title={Attention is all you need}, author={Vaswani, Ashish and Shazeer, Noam and others}, journal={Advances in neural information processing systems}, volume={30}, year={2017}}
- @article{paszke2019pytorch, title={Pytorch: An imperative style, high-performance deep learning library}, author={Paszke, Adam and Gross, Sam and others}, journal={Advances in neural information processing systems}, volume={32}, year={2019}}
- @article{ba2016layer, title={Layer normalization}, author={Ba, Jimmy Lei and Kiros, Jamie Ryan and Hinton, Geoffrey E}, journal={arXiv preprint}, year={2016}}
- @article{zhang2019root, title={Root mean square layer normalization}, author={Zhang, Biao and Sennrich, Rico}, journal={Advances in Neural Information Processing Systems}, volume={32}, year={2019}}
- @article{su2021roformer, title={Roformer: Enhanced transformer with rotary position embedding}, author={Su, Jianlin and others}, journal={arXiv preprint}, year={2021}}
- @article{shazeer2020glu, title={Glu variants improve transformer}, author={Shazeer, Noam}, journal={arXiv preprint}, year={2020}}
- @inproceedings{moritz2018ray, title={Ray: A distributed framework for emerging AI applications}, author={Moritz, Philipp and others}, booktitle={13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)}, pages={561--577}, year={2018}}
- @inproceedings{wolf2020transformers, title={Transformers: State-of-the-art natural language processing}, author={Wolf, Thomas and others}, booktitle={Proceedings of the 2020 conference on empirical methods in natural language processing: system demonstrations}, pages={38--45}, year={2020}}
- @article{touvron2023llama, title={Llama: Open and efficient foundation language models}, author={Touvron, Hugo and others}, journal={arXiv preprint}, year={2023}}
- @article{brown2020language, title={Language models are few-shot learners}, author={Brown, Tom and others}, journal={Advances in neural information processing systems}, volume={33}, pages={1877--1901}, year={2020}}
- @misc{openai2023gpt4, title={GPT-4 Technical Report}, author={OpenAI}, year={2023}, archivePrefix={arXiv}, primaryClass={cs.CL}}
- @article{wei2022emergent, title={Emergent abilities of large language models}, author={Wei, Jason and others}, journal={arXiv preprint}, year={2022}}
- @article{scao2022bloom, title={Bloom: A 176b-parameter open-access multilingual language model}, author={Scao, Teven Le and others}, journal={arXiv preprint}, year={2022}}
- @article{black2022gpt, title={Gpt-neox-20b: An open-source autoregressive language model}, author={Black, Sid and others}, journal={arXiv preprint}, year={2022}}
- @article{bengio2000neural, title={A neural probabilistic language model}, author={Bengio, Yoshua and others}, journal={Advances in neural information processing systems}, volume={13}, year={2000}}
- @inproceedings{he2016deep, title={Deep residual learning for image recognition}, author={He, Kaiming and others}, booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, pages={770--778}, year={2016}}
- @article{chowdhery2022palm, title={Palm: Scaling language modeling with pathways}, author={Chowdhery, Aakanksha and others}, journal={arXiv preprint}, year={2022}}
- @misc{openaiapi-pricing, title={OpenAI API Pricing}, author={OpenAI}, year={2023}}
- @misc{chat-cost, title={For tech giants, AI like Bing and Bard poses billion-dollar search problem}, author={Reuters}, year={2023}}
- @misc{openaiapi, title={OpenAI API}, author={OpenAI}, year={2020}}
- @misc{chatgpt, title={OpenAI ChatGPT}, author={OpenAI}, year={2022}}
- @misc{chatgptuserprompt, title={Custom instructions for ChatGPT}, author={OpenAI}, year={2023}}
- @misc{copilot, title={GitHub Copilot}, author={GitHub}, year={2022}}
- @misc{bard, title={Google Bard}, author={Google}, year={2023}}
- @misc{amazonbedrock, title={Amazon Bedrock}, author={Amazon Web Services}, year={2023}}
- @article{wiseman2016sequence, title={Sequence-to-sequence learning as beam-search optimization}, author={Wiseman, Sam and Rush, Alexander M}, journal={arXiv preprint}, year={2016}}
- @inproceedings{yu2022orca, title={Orca: A Distributed Serving System for Transformer-Based Generative Models}, author={Yu, Gyeong-In and others}, booktitle={16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)}, pages={521--538}, year={2022}}
- @article{zhang2022opt, title={Opt: Open pre-trained transformer language models}, author={Zhang, Susan and others}, journal={arXiv preprint}, year={2022}}
- @article{sutskever2014sequence, title={Sequence to sequence learning with neural networks}, author={Sutskever, Ilya and others}, journal={Advances in neural information processing systems}, volume={27}, year={2014}}
- @article{kilburn1962one, title={One-level storage system}, author={Kilburn, Tom and others}, journal={IRE Transactions on Electronic Computers}, number={2}, pages={223--235}, year={1962}, publisher={IEEE}}
- @article{li2023alpaserve, title={AlpaServe: Statistical Multiplexing with Model Parallelism for Deep Learning Serving}, author={Li, Zhuohan and others}, journal={arXiv preprint}, year={2023}}
- @inproceedings{crankshaw2017clipper, title={Clipper: A Low-Latency Online Prediction Serving System}, author={Crankshaw, Daniel and others}, booktitle={14th USENIX Symposium on Networked Systems Design and Implementation (NSDI 17)}, pages={613--627}, year={2017}}
- @inproceedings{crankshaw2020inferline, title={InferLine: latency-aware provisioning and scaling for prediction serving pipelines}, author={Crankshaw, Daniel and others}, booktitle={Proceedings of the 11th ACM Symposium on Cloud Computing}, pages={477--491}, year={2020}}
- @inproceedings{cui2022dvabatch, title={DVABatch: Diversity-aware Multi-Entry Multi-Exit Batching for Efficient Processing of DNN Services on GPUs}, author={Cui, Weihao and others}, booktitle={2022 USENIX Annual Technical Conference (USENIX ATC 22)}, pages={183--198}, year={2022}}
- @inproceedings{zhou2022pets, title={PetS: A Unified Framework for Parameter-Efficient Transformers Serving}, author={Zhou, Zhe and others}, booktitle={2022 USENIX Annual Technical Conference (USENIX ATC 22)}, pages={489--504}, year={2022}}
- @inproceedings{fang2021turbotransformers, title={TurboTransformers: an efficient GPU serving system for transformer models}, author={Fang, Jiarui and others}, booktitle={Proceedings of the 26th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming}, pages={389--402}, year={2021}}
- @article{olston2017tensorflow, title={Tensorflow-serving: Flexible, high-performance ml serving}, author={Olston, Christopher and others}, journal={arXiv preprint}, year={2017}}
- @misc{nvidiatriton, title={Triton Inference Server}, author={NVIDIA}, year={2023}}
- @misc{sharegpt, title={ShareGPT}, url={https://sharegpt.com/}, author={ShareGPT Team}, year={2023}}
- @misc{alpaca, title={Stanford Alpaca: An Instruction-following LLaMA model}, author={Rohan Taori and others}, howpublished={GitHub repository}, year={2023}}
- @article{wang2022self, title={Self-Instruct: Aligning Language Model with Self Generated Instructions}, author={Wang, Yizhong and others}, journal={arXiv preprint}, year={2022}}
- @misc{nvidiaft, title={FasterTransformer}, author={NVIDIA}, year={2023}}
- @misc{lmsysweek8, title={Chatbot Arena Leaderboard Week 8: Introducing MT-Bench and Vicuna-33B}, author={LMSYS ORG}, year={2023}}
- @misc{nccl, title={NCCL: The NVIDIA Collective Communication Library}, author={NVIDIA}, year={2023}}
- @misc{fastapi, title={FastAPI}, year={2023}}
- @article{jain2020checkmate, title={Checkmate: Breaking the memory wall with optimal tensor rematerialization}, author={Jain, Paras and others}, journal={Proceedings of Machine Learning and Systems}, volume={2}, pages={497--511}, year={2020}}
- @article{chen2016training, title={Training deep nets with sublinear memory cost}, author={Chen, Tianqi and others}, journal={arXiv preprint}, year={2016}}
- @inproceedings{ma2020rammer, title={Rammer: Enabling holistic deep learning compiler optimizations with rtasks}, author={Ma, Lingxiao and others}, booktitle={Proceedings of the 14th USENIX Conference on Operating Systems Design and Implementation}, pages={881--897}, year={2020}}
- @article{steiner2022olla, title={OLLA: Optimizing the Lifetime and Location of Arrays to Reduce the Memory Usage of Neural Networks}, author={Steiner, Benoit and others}, journal={arXiv preprint}, year={2022}}
- @article{rabe2021self, title={Self-attention Does Not Need $O(n^2)$ Memory}, author={Rabe, Markus N and Staats, Charles}, journal={arXiv preprint}, year={2021}}
These references provide the background and context for the paper's contributions and methodology.