A helper class for managing memory on host and device.
using CudaStreamPtr = std::shared_ptr<CudaStream>
explicit BufferManager(CudaStreamPtr stream, bool trimPool = false)
Construct a BufferManager.
- Parameters:
cudaStream – [in] The cuda stream to use for all operations on GPU (allocation, de-allocation, copying, etc.).
inline ~BufferManager()
IBufferPtr gpu(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE) const
Allocates an IBuffer of the given size on the GPU, using cudaMallocAsync.
ITensorPtr gpu(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE) const
Allocates an ITensor of the given dimensions on the GPU, using cudaMallocAsync.
IBufferPtr allocate(MemoryType memoryType, std::size_t size, nvinfer1::DataType type = kBYTE_TYPE) const
Allocates an IBuffer of the given size and memory type.
ITensorPtr allocate(MemoryType memoryType, nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE) const
Allocates an ITensor of the given dimensions and memory type.
inline IBufferPtr emptyBuffer(MemoryType memoryType, nvinfer1::DataType type = kBYTE_TYPE) const
Create an empty IBuffer of the given memory type. It may be resized later.
inline ITensorPtr emptyTensor(MemoryType memoryType, nvinfer1::DataType type = kBYTE_TYPE) const
Create an empty ITensor of the given memory type. It may be reshaped later.
void copy(void const *src, IBuffer &dst, MemoryType srcType) const
void copy(IBuffer const &src, void *dst, MemoryType dstType) const
IBufferPtr copyFrom(IBuffer const &src, MemoryType memoryType) const
Copy src into a new IBuffer with a potentially different memory type.
ITensorPtr copyFrom(ITensor const &src, MemoryType memoryType) const
Copy src into a new ITensor with a potentially different memory type.
template<typename T>
inline IBufferPtr copyFrom(std::vector<T> const &src, MemoryType memoryType) const
Copy src into a new IBuffer with a potentially different memory type.
template<typename T>
inline ITensorPtr copyFrom(T *src, nvinfer1::Dims dims, MemoryType memoryType) const
Copy src into a new ITensor with a potentially different memory type.
template<typename T>
inline ITensorPtr copyFrom(std::vector<T> const &src, nvinfer1::Dims dims, MemoryType memoryType) const
Copy src into a new ITensor with a potentially different memory type.
CudaStream const &getStream() const
Get the underlying cuda stream.
std::size_t memoryPoolReserved() const
The current size of the memory reserved by the memory pool.
std::size_t memoryPoolUsed() const
The current size of the memory used by the memory pool.
std::size_t memoryPoolFree() const
The current size of the memory free in the memory pool.
void memoryPoolTrimTo(std::size_t size)
Try to trim the memory reserved by the pool to size bytes. This synchronizes implicitly with the stream.
static IBufferPtr gpuSync(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE)
Allocates an IBuffer of the given size on the GPU, using cudaMalloc.
static ITensorPtr gpuSync(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE)
Allocates an ITensor of the given dimensions on the GPU, using cudaMalloc.
static IBufferPtr cpu(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE)
Allocates an IBuffer of the given size on the CPU.
static ITensorPtr cpu(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE)
Allocates an ITensor of the given dimensions on the CPU.
static IBufferPtr pinned(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE)
Allocates a pinned IBuffer of the given size on the CPU.
static ITensorPtr pinned(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE)
Allocates a pinned ITensor of the given dimensions on the CPU.
static IBufferPtr pinnedPool(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE)
Allocates a pinned IBuffer of the given size on the CPU in the default memory pool.
static ITensorPtr pinnedPool(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE)
Allocates a pinned ITensor of the given dimensions on the CPU in the default memory pool.
static IBufferPtr managed(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE)
Allocates an IBuffer of the given size in UVM.
static ITensorPtr managed(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE)
Allocates an ITensor of the given dimensions in UVM.
static void initMemoryPool(int device)
static std::size_t memoryPoolReserved(int device)
static std::size_t memoryPoolUsed(int device)
static inline std::size_t memoryPoolFree(int device)
static void memoryPoolTrimTo(int device, std::size_t size)
using pointer = cudaEvent_t
inline explicit CudaEvent(unsigned int flags = cudaEventDisableTiming)
Creates a new cuda event. The event will be destroyed in the destructor.
- Parameters:
flags – Flags for event creation. By default, event timing is disabled.
inline explicit CudaEvent(pointer event, bool ownsEvent = true)
Pass an existing cuda event to this object.
- Parameters:
event – The event to pass to this object.
ownsEvent – Whether this object owns the event and destroys it in the destructor.
inline void synchronize() const
Synchronizes the event.
using EventPtr = std::unique_ptr<element_type, Deleter>
class CudaStream
inline explicit CudaStream(unsigned int flags = cudaStreamNonBlocking, int priority = 0)
Creates a new cuda stream on the current device. The stream will be destroyed in the destructor.
- Parameters:
flags – Flags for stream creation. See ::cudaStreamCreateWithFlags for a list of valid flags that can be passed.
priority – Priority of the stream. Lower numbers represent higher priorities. See ::cudaDeviceGetStreamPriorityRange for more information about the meaningful stream priorities that can be passed.
inline explicit CudaStream(cudaStream_t stream, int device, bool ownsStream = true)
Pass an existing cuda stream to this object.
- Parameters:
stream – The stream to pass to this object.
device – The device on which the stream was created.
ownsStream – Whether this object owns the stream and destroys it in the destructor.
inline explicit CudaStream(cudaStream_t stream)
Construct with an existing cuda stream or the default stream by passing nullptr.
inline int getDevice() const
Returns the device on which the stream was created.
inline cudaStream_t get() const
Returns the stream associated with this object.
inline void synchronize() const
Synchronizes the stream.
class DecodingInput
inline DecodingInput(SizeType32 maxLength, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, SizeType32 maxBatchSize, TensorPtr logits, TensorPtr endIds)
Public Members
SizeType32 step
SizeType32 maxLength
SizeType32 maxAttentionWindow
SizeType32 sinkTokenLength
SizeType32 maxBatchSize
SizeType32 maxStopWordsLen
SizeType32 maxBadWordsLen
std::optional<MedusaInputs> medusaInputs
class MedusaInputs
inline DecodingInput(SizeType32 maxLength, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, SizeType32 maxBatchSize, TensorPtr logits, TensorPtr endIds)
class DecodingInput
namespace runtime
namespace tensorrt_llm
namespace runtime
class DecodingOutput
static constexpr float kNegativeInfinity = -1e20f
class BeamHypotheses
Public Functions
void empty(BufferManager &manager)
void reshape(SizeType32 batchSize, SizeType32 beamWidth, SizeType32 maxSequenceLength)
void release()
void init(BufferManager &manager, TokenIdType endId)
BeamHypotheses slice(SizeType32 batchIndex, SizeType32 size) const
void empty(BufferManager &manager)
class MedusaOutputs
Subclassed by tensorrt_llm::runtime::GptDecoder< T >
virtual ~IGptDecoder() = default
virtual void setup(SamplingConfig const &samplingConfig, size_t batchSize, SizeType32 maxSequenceLength, std::optional<TensorPtr> const &batchSlots = std::nullopt) = 0
virtual bool forward(DecodingOutput &output, DecodingInput const &input) = 0
virtual void forwardAsync(DecodingOutput &output, DecodingInput const &input) = 0
virtual void gatherTree(ITensor &finalOutputIds, DecodingOutput const &decodingOutput, DecodingInput const &decodingInput, BufferManager const &manager) = 0
virtual SamplingConfig const &getSamplingConfig() = 0
static void acceptDraftTokensByIds(ITensor const &targetTokenIds, ITensor const &draftTokenIds, ITensor const &contextLengths, ITensor const &numDraftTokens, ITensor &sequenceLengths, ITensor const &finishedVec, ITensor &finishedFinal, ITensor &finishedSum, ITensor const &batchSlots, BufferManager::CudaStreamPtr const &stream)
static void acceptDraftTokensByLogits(ITensor &draftLogits, ITensor const &targetLogits, ITensor &draftProbs, ITensor &targetProbs, ITensor const &numDraftTokens, ITensor &finished, ITensor const &batchSlots, SizeType32 vocabSize, SizeType32 vocabSizePadded, bool useRandomAcceptThreshold, float randomAcceptThreshold, curandState_t *curandState, BufferManager::CudaStreamPtr const &stream)
static inline std::unique_ptr<IGptDecoder> create(DecodingMode const &mode, nvinfer1::DataType dtype, size_t maxBatchSize, size_t maxBeamWidth, size_t vocabSize, size_t vocabSizePadded, size_t maxSequenceLength, BufferManager::CudaStreamPtr const &stream, std::optional<runtime::SizeType32> maxTokensPerStep = std::nullopt, std::optional<runtime::SizeType32> maxNumMedusaHeads = std::nullopt)
virtual ~IGptDecoder() = default
class GptDecoder : public virtual tensorrt_llm::runtime::IGptDecoder Public Types
using CudaStreamPtr = BufferManager::CudaStreamPtr
GptDecoder(DecodingMode const &mode, size_t maxBatchSize, size_t maxBeamWidth, size_t vocabSize, size_t vocabSizePadded, size_t maxSequenceLength, CudaStreamPtr const &stream, std::optional<runtime::SizeType32> maxTokensPerStep = std::nullopt, std::optional<runtime::SizeType32> maxNumMedusaHeads = std::nullopt)
virtual void setup(SamplingConfig const &samplingConfig, size_t batchSize, SizeType32 maxSequenceLength, std::optional<TensorPtr> const &batchSlots = std::nullopt) override
virtual bool forward(DecodingOutput &output, DecodingInput const &input) override
virtual void forwardAsync(DecodingOutput &output, DecodingInput const &input) override
virtual void gatherTree(ITensor &finalOutputIds, DecodingOutput const &decodingOutput, DecodingInput const &decodingInput, BufferManager const &manager) override
inline virtual SamplingConfig const &getSamplingConfig() override
BufferManager mManager
std::shared_ptr<tensorrt_llm::layers::DynamicDecodeLayer<T>> mDynamicDecodeLayer
SamplingConfig mSamplingConfig
size_t mMaxBatchSize
class GenericGenerationInput - #include <generationInput.h>
- endId, is the token ID that marks the end of the input sequence (aka EOS or end-of-sequence). It’s 50,256 for the GPT2 model which has a vocabulary of 50,257 tokens, for example,
- padId, is the token ID that is used for padding (i.e. fills in the slots that are at an index greater-or-equal to the input length for padded sequences). It can be set to the same value as endId,
- ids, is the tensor of input IDs. That tensor must be allocated on the GPU. When the input tensor is padded, the shape of ids is [batchSize, maxInputLength], where batchSize and maxInputLength must respect the maximum sizes in sessionConfig passed to the GptSession constructor. When the input is packed, the shape of ids is [numTokens], where numTokens is the sum of the lengths of the different sequences in the batch,
- lengths, is the tensor of input sequence lengths. That tensor must be allocated on the GPU and contain batchSize values,
- packed, indicates if the ids tensor is packed or padded. In this release, that flag must match the value passed to the constructor through the instance of the ModelConfig class. In a future release, the session may be made more flexible and automatically pad or pack the input,
- embeddingBias, is a tensor of floating-point values on the GPU that contains the bias to add to the logits during sampling (after the projection from hidden states to logits as the last step of the model). This tensor must have vocabSize elements (as defined in the modelConfig argument passed to the constructor),
- badWordsList, is a tensor of integers on the GPU that encodes the list of words that have to be banned from generated sequences. Its shape is [2, badWordsLength], as explained below, or [batchSize, 2, badWordsLength] when there is a different list for each sequence in the batch,
- stopWordsList, is a tensor of integers on the GPU that encodes the list of words that trigger the end of the generation for a sequence. Its shape is [2, stopWordsLength], as explained below, or [batchSize, 2, stopWordsLength] when there is a different list for each sequence in the batch,
- maxNewTokens, is the maximum number of tokens to generate.
The badWordsList and stopWordsList tensors have the same shape [2, length]. Let’s consider an example with three words to describe the representation of those lists. The first word contains tokens [5, 7, 3], the second one contains [9, 2] and the third one is composed of tokens [6, 2, 4, 1]. In total, there are 9 tokens. That’s the length. The shape of the tensor is [2, 9]. The first row of the tensor must contain the 9 token IDs and the second row must store the inclusive prefix-sum of the word lengths as shown on the following diagram:

   0           3       5              9
   |           |       |              |
   V           V       V              V
[  5,  7,  3,  9,  2,  6,  2,  4,  1]
[  3,  5,  9, -1, -1, -1, -1, -1, -1]
In case all the words are made of a single token, the inner-most dimension of the tensor must be increased by 1 (i.e. the length for 4 words, each made of a single token, must be 5 instead of 4 — the shape is [2, 5]).
[2, 5]
inline explicit GenericGenerationInput(SizeType32 const endId, SizeType32 const padId, TensorPtr ids, TensorPtr lengths, bool packed = false)
Public Members
SizeType32 endId
SizeType32 padId
bool packed
std::optional<SizeType32> maxNewTokens
PromptTuningParams promptTuningParams
class GenerationInput : public tensorrt_llm::runtime::GenericGenerationInput<ITensor::SharedPtr, PromptTuningParams>
using Base = GenericGenerationInput<ITensor::SharedPtr, PromptTuningParams>
inline explicit GenerationInput(SizeType32 const endId, SizeType32 const padId, TensorPtr ids, TensorPtr lengths, bool packed = false)
template<typename TTensor>
class GenericGenerationOutput - #include <generationOutput.h>
- ids, is a tensor that contains the output token IDs. Its shape is [batchSize, beamWidth, maxSeqLength] where maxSeqLength is the sum of maxInputLength and maxNewTokens. After generation, it contains, for each sequence, a copy of the input tokens followed by the output tokens. When a sequence is shorter than maxSeqLength, padding tokens are added at the end of the sequence.
Note that the shape of that tensor is different in this version of TensorRT-LLM from its shape in previous versions where it was [batchSize, beamWidth, maxInputLength + maxNewTokens].
- logProbs, is a tensor of floating-point values on the GPU to store the log-prob of the generated tokens. Its shape is [maxNewTokens, batchSize, beamWidth]. Its shape will likely change in a future release to match the shape of the output ids tensor,
- contextLogits, is a tensor of values on the GPU (same datatype as the computation type) to store the logits for the context. Its shape is [batchSize, maxSequenceLength, vocabSizePadded]. If use remove_input_padding, its shape is [packedSize, vocabSizePadded]. This buffer will only be filled in if the TensorRT engine was built with the gather_context_logits parameter enabled.
After inference is complete, you can get the context logits in GenerationOutput.contextLogits, these are variables on the GPU. For specific acquisition methods, please refer to the example of gptSessionBenchmark.cpp.
It is important to point out that enabling the computation may have an impact on performance (the language modeling head (LM head) has to perform a matrix multiplication on all the context tokens instead of just the last one).
- generationLogits, is a tensor of values on the GPU (same datatype as the computation type) to store the logits for the generation. Its shape is [batchSize, beamWidth, maxOutputLen, vocabSizePadded]. This buffer will only be filled in if the TensorRT engine was built with the gather_generation_logits parameter enabled.
Generation logits can also be obtained through GenerationOutput.generationLogits after inference is completed.
- onTokenGenerated, is a callback function invoked in the generation loop to pass newly generated tokens to the caller while the loop continues to execute. An implementation of that callback must accept the output ids tensor, the generation step and a boolean flag that indicates if the generation is complete.
using Callback = std::function<void(TensorPtr const &ids, SizeType32 step, bool finished)>
class GenerationOutput : public tensorrt_llm::runtime::GenericGenerationOutput<ITensor::SharedPtr>
using Base = GenericGenerationOutput<ITensor::SharedPtr>
class GptDecoderBatch : public tensorrt_llm::runtime::IGptDecoderBatch
- #include <gptDecoderBatch.h>
GPT decoder class with support for in-flight batching.
GptDecoderBatch(std::size_t vocabSize, std::size_t vocabSizePadded, CudaStreamPtr stream)
virtual void setup(DecodingMode const &mode, SizeType32 maxBatchSize, SizeType32 maxBeamWidth, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, SizeType32 maxSequenceLength, SizeType32 maxTokensPerStep, bool fusedDecoder, nvinfer1::DataType dtype, ModelConfig const &modelConfig) override
Setup the decoder before calling forward().
virtual void newBatch(GenerationInput const &inputs, GenerationOutput const &outputs, SamplingConfig const &samplingConfig) override
Initialize the decoder with new batch of inputs.
virtual void newRequests(std::vector<SizeType32> const &seqSlots, std::vector<decoder_batch::Request> const &requests, std::vector<SamplingConfig> const &samplingConfigs) override
Initialize batched decoder at seqSlots with new requests.
virtual TokenPtr forwardAsync(decoder_batch::Output &output, decoder_batch::Input const &input) override
Run one step for all requests without blocking the host process and return the token for synchronization.
virtual void forwardSync(decoder_batch::Token const &token) override
Wait for the call to forwardAsync associated with a token to complete.
virtual void forwardAsync(decoder::Output &output, decoder::Input const &input) override
Run one step for all requests without blocking the host thread.
virtual void forwardSync() override
Wait for the last call to forwardAsync to complete.
inline virtual std::vector<bool> getFinished() const override
- Returns:
[batchSize], indicators of finished requests
inline virtual TensorPtr getOutputIds(SizeType32 batchIdx) const override
- Parameters:
batchIdx – index of the batch
- Returns:
[maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding for request batchIdx, on gpu
inline virtual TensorPtr getOutputIds() const override
- Returns:
[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding, on gpu
virtual CudaEvent finalize(SizeType32 batchIdx) const override
Gather final beam search results for request batchIdx. Result will only be available after event returned.
virtual void finalize() const override
Gather final beam search results for all requests.
inline virtual TensorPtr getParentIds() const override
- Returns:
[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], contains parent ids collected during beam search without padding, on gpu
inline virtual TensorPtr getCumLogProbs() const override
- Returns:
[batchSize, maxBeamWidth], cumulative log probabilities (per beam), on gpu
inline virtual TensorPtr getCumLogProbs(SizeType32 batchIdx) const override
- Returns:
[maxBeamWidth], cumulative log probabilities (per beam), on gpu
inline virtual TensorPtr getLogProbs() const override
- Returns:
[batchSize, maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu
inline virtual TensorPtr getLogProbs(SizeType32 batchIdx) const override
- Returns:
[maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu
inline virtual TensorPtr getAllNewTokens() const override
Get maxTokensPerStep tokens generated in the last forward pass.
- Returns:
[maxTokensPerStep, batchSize, maxBeamWidth], tokens generated in last forward pass, on gpu
inline virtual TensorPtr getNewTokens(SizeType32 iter = 0) const override
Get tokens generated in one step of last forward pass.
- Parameters:
iter – The iteration within [0; maxTokensPerStep) for which to get the tokens
- Returns:
[batchSize, beamWidth], tokens generated in iter (per beam), on gpu
inline virtual std::vector<SizeType32> getNbSteps() const override
- Returns:
[batchSize], the number of generation steps executed on each request
inline virtual TensorPtr getNbFinished() const override
- Returns:
[1], number of finished sequences, in pinned host memory
inline virtual TensorPtr getNextDraftTokens() const override
- Returns:
[batchSize, maxTokensPerStep-1], predicted draft tokens for next step, on gpu
using GptDecoderPtr = std::unique_ptr<IGptDecoder>
using DecodingInputPtr = std::unique_ptr<DecodingInput>
using DecodingOutputPtr = std::unique_ptr<DecodingOutput>
CudaEvent postProcessRequest(SizeType32 batchIdx) const
Gather final beam search results for request batchIdx.
void newRequest(SizeType32 batchIdx, decoder_batch::Request const &request, SamplingConfig const &samplingConfig)
Initialize the decoder at batchIdx with a new request.
void allocateMedusaBuffers()
Allocate buffers for medusa decoding.
void setupMedusa(ModelConfig const &modelConfig)
Setup buffers for medusa decoding.
void newRequestSpeculativeDecoding(SizeType32 batchIdx, decoder_batch::Request const &request, SamplingConfig const &samplingConfig)
Setups decoder internal tensors for new speculative decoding request.
void newRequestMedusa(SizeType32 batchIdx, decoder_batch::Request const &request)
Setups decoder internal tensors for new Medusa request.
void forwardAsyncUnfusedDecoder(SizeType32 step, decoder_batch::Output &output, decoder_batch::Input const &input, CudaEvent const &eventStart)
Asynchronously calls unfused decoder for whole batch in loop.
void forwardAsyncFusedDecoder(SizeType32 step, decoder_batch::Output &output, decoder_batch::Input const &input, CudaEvent const &eventStart)
Asynchronously calls fused decoder for whole batch.
std::size_t const mVocabSize
std::size_t const mVocabSizePadded
CudaStreamPtr mStream
BufferManager mBufferManager
TokenPtr mForwardToken
std::vector<CudaStreamPtr> mStreams
std::vector<GptDecoderPtr> mDecoders
std::vector<DecodingInputPtr> mDecodingInputs
std::vector<DecodingOutputPtr> mDecodingOutputs
DecodingInputPtr mJointDecodingInput
DecodingOutputPtr mJointDecodingOutput
std::vector<bool> mAcceptByLogits
std::vector<SizeType32> mNbSteps
std::vector<bool> mFinished
std::vector<SizeType32> mMaxNewTokens
std::vector<SizeType32> mBeamWidths
std::vector<SizeType32> mGeneratedTokensPerEngineStep
SizeType32 mMaxSequenceLength = {}
SizeType32 mMaxAttentionWindow = {}
SizeType32 mSinkTokenLength = {}
SizeType32 mActualBatchSize = {}
SizeType32 mMaxTokensPerEngineStep = {}
SizeType32 mMaxStopWordsLen = {}
SizeType32 mMaxBadWordsLen = {}
SizeType32 mMaxTokensPerDecoderStep = {}
bool mFusedDecoder = {false}
bool mUseMedusa = {false}
inline GptJsonConfig(std::string name, std::string version, std::string precision, SizeType32 tensorParallelism, SizeType32 pipelineParallelism, SizeType32 gpusPerNode, ModelConfig const &modelConfig)
inline ModelConfig getModelConfig() const
inline std::string const &getName() const
inline std::string const &getVersion() const
inline std::string const &getPrecision() const
inline constexpr SizeType32 getTensorParallelism() const
inline constexpr SizeType32 getPipelineParallelism() const
inline constexpr SizeType32 getGpusPerNode() const
inline constexpr SizeType32 getWorldSize() const
std::string engineFilename(WorldConfig const &worldConfig, std::string const &model) const
inline std::string engineFilename(WorldConfig const &worldConfig) const
static GptJsonConfig parse(std::string const &json)
static GptJsonConfig parse(std::istream &json)
static GptJsonConfig parse(std::filesystem::path const &path)
std::string const mName
std::string const mVersion
std::string const mPrecision
SizeType32 const mTensorParallelism
SizeType32 const mPipelineParallelism
SizeType32 const mGpusPerNode
ModelConfig const mModelConfig
inline GptJsonConfig(std::string name, std::string version, std::string precision, SizeType32 tensorParallelism, SizeType32 pipelineParallelism, SizeType32 gpusPerNode, ModelConfig const &modelConfig)
class GptJsonConfig
namespace runtime
namespace tensorrt_llm
namespace runtime
class GptSession
GptSession(Config const &sessionConfig, ModelConfig const &modelConfig, WorldConfig const &worldConfig, void const *engineBuffer, std::size_t engineSize, LoggerPtr logger = nullptr)
- Parameters:
sessionConfig – Configuration of the session,
modelConfig – Description of the model,
worldConfig – Description of the environment,
engineBuffer – The compiled TensorRT engine (const void*),
engineSize – The size in bytes of the TensorRT engine (size_t),
logger – The optional logger.
inline GptSession(Config const &sessionConfig, ModelConfig const &modelConfig, WorldConfig const &worldConfig, std::vector<uint8_t> const &engineBuffer, LoggerPtr logger = nullptr)
inline GptSession(Config const &sessionConfig, ModelConfig const &modelConfig, WorldConfig const &worldConfig, std::string const &engineFile, LoggerPtr logger = nullptr)
BufferManager const &getBufferManager() const
inline ModelConfig const &getModelConfig() const
inline WorldConfig const &getWorldConfig() const
inline int getDevice() const noexcept
inline bool getNormalizeLogProbs() const noexcept
This function performs the generation loop.
Given input tensors to read from, output tensors to populate, that member function will run the generation loop until it reaches the maximum number of tokens that can be produced or each sequence has reached completion (due to the production of “end-of-sequence” or a word in the list of “stop words”). The pseudo-code of that function looks like (member function names were changed to keep the presentation simple):
// Have all the sequences in the batch reached completion?
bool allFinished = false;

// Until all sequences are finished or the number of steps reaches the limit...
for (int step = 0; !allFinished && step < maxNewTokens; ++step) {

  // Trigger the computation of the logits...
  computeLogits(...);

  // Run the sampling to produce a token (for each active sequence) from the logits.
  allFinished = generateTokensFromLogits(...);

  // Callback to stream the output tokens while the generation loop continues.
  onTokenGenerated(...);
}
void setLayerProfiler()
Set LayerProfiler to collect performance per layer.
std::string getLayerProfileInfo() const
Print profile information per layer.
using KvCacheManager = batch_manager::kv_cache_manager::KVCacheManager
using KvCacheConfig = batch_manager::kv_cache_manager::KvCacheConfig
using TokenGeneratedCallback = std::function<void(SizeType32 step, bool finished)>
inline bool useCudaGraphs()
void createContexts()
void createBuffers(SizeType32 numMicroBatches)
void createDecoders(SizeType32 batchSize, SizeType32 beamWidth, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, SizeType32 maxSequenceLength, nvinfer1::DataType logitsType, bool decoderPerRequest, SizeType32 numMicroBatches, DecodingMode const &decodingMode)
void createKvCacheManager(SizeType32 batchSize, SizeType32 beamWidth, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, SizeType32 maxSequenceLength, KvCacheConfig const &config)
void createCustomAllReduceWorkspace(SizeType32 batchSize, SizeType32 beamWidth, SizeType32 maxSequenceLength)
void executeContextStep(std::vector<GenerationInput> const &generationBatchesInputs, std::vector<SizeType32> const &generationBatchesOffsets, KvCacheManager const *kvCacheManager)
SizeType32 executeGenerationStep(SizeType32 step, std::vector<GenerationInput> const &microBatchesInputs, std::vector<GenerationOutput> &microBatchesOutputs, std::vector<SizeType32> const &microBatchOffsets, KvCacheManager *kvCacheManager, std::vector<bool> &microBatchesFinished)
void decoderStepAsync(SizeType32 decoderStep, SizeType32 microBatchId)
Execute decoder on last PP rank, receive decoder output on other PP ranks.
bool shouldStopSync(SizeType32 batchSize, SizeType32 beamWidth, SizeType32 microBatchId)
Synchronize with the decoder and return the shouldStop flag.
void finalize(SizeType32 microBatchId)
Collect final output ids and log probs on last PP rank and send them to first PP rank.
Receives are asynchronous on host, so synchronization is required before access.
void kvCacheAddSequences(SizeType32 beamWidth, SizeType32 microBatchId, SizeType32 firstBatchIdx)
ITensor::SharedPtr initDecoder(ITensor &outputIds, GenerationInput const &inputs, GenerationOutput const &outputs, SamplingConfig const &samplingConfig, SizeType32 microBatchId) const
Populate outputIds and return reference to newTokens tensor.
TokenGeneratedCallback createOnTokenGeneratedCallback(GenerationOutput &outputs)
ModelConfig const mModelConfig
WorldConfig const mWorldConfig
int mDevice = {-1}
std::shared_ptr<NcclCommunicator> mPipelineComm
std::shared_ptr<CudaStream> mCommStream
std::shared_ptr<AllReduceBuffers> mAllReduceBuffers
SizeType32 mDecoderMaxSequenceLength = {}
SizeType32 mDecoderMaxAttentionWindow = {}
SizeType32 mDecoderSinkTokenLength = {}
std::shared_ptr<TllmRuntime> mRuntime
std::shared_ptr<KvCacheManager> mKvCacheManager
MicroBatchConfig mMicroBatchConfig
std::vector<std::shared_ptr<IStatefulGptDecoder>> mDecoders
std::vector<std::shared_ptr<RuntimeBuffers>> mBuffers
bool mCudaGraphMode = {false}
std::vector<CudaGraphExecutor> mCudaGraphInstances
bool mNormalizeLogProbs = true
- #include <gptSession.h>
Configuration for session execution and buffer sizes.
generate may be called with batch size and beam width smaller than the configured parameters.
maxBatchSize will be divided by the number of micro batches to initialize each batch buffer.
inline Config(SizeType32 maxBatchSize, SizeType32 maxBeamWidth, SizeType32 maxSequenceLength, float gpuWeightsPercent = 1.0)
SizeType32 maxBatchSize
SizeType32 maxBeamWidth
SizeType32 maxSequenceLength
float gpuWeightsPercent
bool decoderPerRequest = {false}
bool cudaGraphMode = {false}
KvCacheConfig kvCacheConfig = {}
std::optional<SizeType32> ctxMicroBatchSize = std::nullopt
std::optional<SizeType32> genMicroBatchSize = std::nullopt
std::optional<DecodingMode> decodingMode = std::nullopt
bool normalizeLogProbs = true
void prepareNextGraph(TllmRuntime const &runtime, SizeType32 nextContextId)
void launch(CudaStream const &stream)
void create(cudaGraph_t const &graph)
bool update(cudaGraph_t const &graph)
void uploadToStream(CudaStream const &stream)
cudaGraphExec_t mInstance
CudaGraphExecutor() = default
class GenerationProfiler
- #include <gptSession.h>
Optional profiler class to profile the generation phase of an inference request.
static constexpr unsigned int flags = {cudaEventDefault}
SizeType32 numCtxBatches
SizeType32 numGenBatches
SizeType32 ctxBatchSize
SizeType32 genBatchSize
static constexpr auto value = "GPU"
static constexpr auto value = "GPU"
struct MemoryTypeString<MemoryType::kCPU> Public Static Attributes
static constexpr auto value = "CPU"
static constexpr auto value = "CPU"
struct MemoryTypeString<MemoryType::kPINNED> Public Static Attributes
static constexpr auto value = "PINNED"
static constexpr auto value = "PINNED"
struct MemoryTypeString<MemoryType::kUVM> Public Static Attributes
static constexpr auto value = "UVM"
static constexpr auto value = "UVM"
struct DataTypeTraits<nvinfer1::DataType::kINT32> Public Types
using type = std::int32_t
using type = std::int32_t
struct DataTypeTraits<nvinfer1::DataType::kINT64> Public Types
using type = std::int64_t
using type = std::int64_t
struct DataTypeTraits<nvinfer1::DataType::kINT32, true> Public Types
using type = std::uint32_t
using type = std::uint32_t
struct DataTypeTraits<nvinfer1::DataType::kINT64, true> Public Types
using type = std::uint64_t
using type = std::uint64_t
template<bool kUnsigned>
struct DataTypeTraits<nvinfer1::DataType::kBOOL, kUnsigned> Public Types
using type = bool
using type = bool
template<bool kUnsigned>
struct DataTypeTraits<nvinfer1::DataType::kUINT8, kUnsigned> Public Types
using type = std::uint8_t
using type = std::uint8_t
struct TRTDataType<std::int8_t>
struct TRTDataType<std::int32_t>
struct TRTDataType<std::uint32_t> Public Static Attributes
static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT32, true}
static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT32, true}
struct TRTDataType<std::int64_t>
struct TRTDataType<std::uint64_t> Public Static Attributes
static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT64, true}
static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT64, true}
struct TRTDataType<std::uint8_t>
struct TRTDataType<kernels::KVCacheIndex> Public Static Attributes
static constexpr auto value = TRTDataType<kernels::KVCacheIndex::UnderlyingType>::value
static constexpr auto value = TRTDataType<kernels::KVCacheIndex::UnderlyingType>::value
namespace tensorrt_llm
namespace runtime
template<MemoryType T>
struct MemoryTypeString
template<> struct MemoryTypeString<MemoryType::kGPU>
static constexpr auto value = "GPU"
template<> struct MemoryTypeString<MemoryType::kPINNED>
static constexpr auto value = "PINNED"
struct DataTypeTraits - #include <iBuffer.h>
For converting a TensorRT data type to a C++ data type.
template<> struct DataTypeTraits<nvinfer1::DataType::kFLOAT>
using type = float
template<> struct DataTypeTraits<nvinfer1::DataType::kHALF>
Public Types
using type = half
Public Static Attributes
static constexpr char name[] = "half"
static constexpr auto size = sizeof(type)
using type = half
Public Types
Public Static Attributes
static constexpr char name[] = "int8"
static constexpr auto size = sizeof(type)
Public Types
Public Static Attributes
static constexpr char name[] = "int32"
static constexpr auto size = sizeof(type)
Public Types
Public Static Attributes
static constexpr char name[] = "int64"
static constexpr auto size = sizeof(type)
Public Types
Public Static Attributes
static constexpr char name[] = "uint32"
static constexpr auto size = sizeof(type)
Public Types
Public Static Attributes
static constexpr char name[] = "uint64"
static constexpr auto size = sizeof(type)
Public Types
Public Static Attributes
static constexpr char name[] = "bool"
static constexpr auto size = sizeof(type)
Public Types
Public Static Attributes
static constexpr char name[] = "uint8"
static constexpr auto size = sizeof(type)
struct TRTDataType - #include <iBuffer.h>
For converting a C++ data type to a TensorRT data type.
virtual void *data() = 0
Returns a pointer to underlying array.
virtual void const *data() const = 0
Returns a pointer to underlying array.
inline virtual void *data(std::size_t index)
Returns a pointer to the underlying array at a given element index.
inline virtual void const *data(std::size_t index) const
Returns a pointer to the underlying array at a given element index.
virtual std::size_t getSize() const = 0
Returns the size (in number of elements) of the buffer.
inline virtual std::size_t getSizeInBytes() const
Returns the size (in bytes) of the buffer.
virtual std::size_t getCapacity() const = 0
Returns the capacity of the buffer.
virtual char const *getDataTypeName() const
virtual MemoryType getMemoryType() const = 0
Returns the memory type of the buffer.
virtual char const *getMemoryTypeName() const
virtual void resize(std::size_t newSize) = 0
Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity.
virtual void release() = 0
Releases the buffer. It will be reset to nullptr.
virtual ~IBuffer() = default
Creates a sliced view on the underlying buffer. The view will have the same data type as buffer.
- Parameters:
buffer – The buffer to view.
offset – The offset of the view.
size – The size of the view.
- Returns:
A view on the buffer.
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset, std::size_t size)
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset)
Returns a view on the underlying tensor which can be independently resized.
- Parameters:
tensor – The tensor to view.
- Returns:
A view on the tensor.
Returns a view on the underlying tensor with a different size.
- Parameters:
tensor – The tensor to view.
size – The size of the view.
- Returns:
A view on the tensor.
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr view(TConstPtr &&tensor, std::size_t size)
static UniquePtr wrap(void *data, DataType type, std::size_t size, std::size_t capacity)
Wraps the given data in an IBuffer. The IBuffer will not own the underlying data and cannot be resized beyond capacity.
- Parameters:
data – The data to wrap.
type – The data type of the data.
size – The size of the buffer.
capacity – The capacity of the buffer.
- Returns:
static MemoryType memoryType(void const *data)
Determine the memory type of a pointer.
namespace tensorrt_llm
namespace runtime
class IGptDecoderBatch : public virtual tensorrt_llm::runtime::IStatefulGptDecoder
- #include <iGptDecoderBatch.h>
GPT decoder class with support for in-flight batching.
Subclassed by tensorrt_llm::runtime::GptDecoderBatch
using CudaStreamPtr = std::shared_ptr<CudaStream>
using TokenPtr = std::unique_ptr<decoder_batch::Token const>
virtual TokenPtr forwardAsync(decoder_batch::Output &output, decoder_batch::Input const &input) = 0
Run one step for all requests without blocking the host process and return the token for synchronization.
virtual void forwardSync(decoder_batch::Token const &token) = 0
Wait for the call to forwardAsync associated with a token to complete.
inline virtual void forward(decoder_batch::Output &output, decoder_batch::Input const &input)
Run one step for all requests and wait for completion on the host.
- Parameters:
batchIdx – index of the batch
- Returns:
[maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding for request batchIdx, on gpu
virtual CudaEvent finalize(SizeType32 batchIdx) const = 0
Gather final beam search results for request batchIdx. The result will only be available after the returned event has completed.
virtual std::vector<bool> getFinished() const = 0
- Returns:
[batchSize (actual)], marks finished requests (per batch)
virtual TensorPtr getCumLogProbs() const = 0
- Returns:
[batchSize, beamWidth], cumulative log probabilities (per beam), on gpu
virtual TensorPtr getCumLogProbs(SizeType32 batchIdx) const = 0
- Returns:
[beamWidth], cumulative log probabilities (per beam) for request batchIdx, on gpu
virtual TensorPtr getLogProbs() const = 0
- Returns:
[batchSize, beamWidth, maxSeqLen], log probabilities (per beam), on gpu
virtual TensorPtr getLogProbs(SizeType32 batchIdx) const = 0
- Returns:
[beamWidth, maxSeqLen], log probabilities (per beam) for request batchIdx, on gpu
virtual std::vector<SizeType32> getNbSteps() const = 0
virtual void newRequests(std::vector<SizeType32> const &seqSlots, std::vector<decoder_batch::Request> const &requests, std::vector<SamplingConfig> const &samplingConfigs) = 0
Initialize batched decoder at seqSlots with the new requests.
virtual TensorPtr getNextDraftTokens() const = 0
- Returns:
[batchSize, maxTokensPerStep-1], predicted draft tokens for next step, on gpu
IGptDecoderBatch() = default
using ConstTensorPtr = ITensor::SharedConstPtr
Public Functions
inline explicit Request(ConstTensorPtr ids, SizeType32 inputLen, std::optional<SizeType32> maxNewTokens = std::nullopt, std::optional<SizeType32> endId = std::nullopt)
Public Members
ConstTensorPtr ids
SizeType32 inputLen
std::optional<SizeType32> maxNewTokens
std::optional<SizeType32> endId
bool computeCumLogProbs
bool computeLogProbs
SizeType32 generatedTokensPerEngineStep
class Request
- #include <iStatefulGptDecoder.h>
GPT decoder class with support for in-flight batching.
Subclassed by tensorrt_llm::runtime::IGptDecoderBatch
using CudaStreamPtr = std::shared_ptr<CudaStream>
virtual void setup(DecodingMode const &mode, SizeType32 maxBatchSize, SizeType32 maxBeamWidth, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, SizeType32 maxSequenceLength, SizeType32 maxTokensPerStep, bool fusedDecoder, nvinfer1::DataType dtype, ModelConfig const &modelConfig) = 0
Set up the decoder before calling forward(); also calls reshapeBuffers.
virtual void newBatch(GenerationInput const &inputs, GenerationOutput const &outputs, SamplingConfig const &samplingConfig) = 0
Initialize the decoder with new batch of inputs.
virtual void forwardAsync(decoder::Output &output, decoder::Input const &input) = 0
Run one step for all requests without blocking the host thread.
virtual void forwardSync() = 0
Wait for the last call to forwardAsync to complete.
inline virtual void forward(decoder::Output &output, decoder::Input const &input)
Run one step for all requests.
virtual void finalize() const = 0
Gather final beam search results for all requests.
virtual TensorPtr getOutputIds() const = 0
- Returns:
[batchSize, beamWidth, maxSequenceLength], all token ids, on gpu
virtual TensorPtr getCumLogProbs() const = 0
- Returns:
[batchSize, maxBeamWidth], cumulative log probabilities (per beam), on gpu
virtual TensorPtr getLogProbs() const = 0
- Returns:
[batchSize, maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu
virtual TensorPtr getNewTokens(SizeType32 iter = 0) const = 0
Get tokens generated in one step of last forward pass.
- Parameters:
iter – The iteration within [0; maxTokensPerStep) for which to get the tokens
- Returns:
[batchSize, beamWidth], tokens generated in iter (per beam), on gpu
virtual TensorPtr getAllNewTokens() const = 0
Get maxTokensPerStep tokens generated in the last forward pass.
- Returns:
[maxTokensPerStep, batchSize, maxBeamWidth], tokens generated in last forward pass, on gpu
virtual TensorPtr getNbFinished() const = 0
- Returns:
[1], number of finished sequences, in pinned host memory
virtual ~IStatefulGptDecoder() = default
IStatefulGptDecoder() = default
~ITensor() override = default
virtual void reshape(Shape const &dims) = 0
Sets the tensor dimensions. The new size of the tensor will be volume(dims).
inline virtual void resize(std::size_t newSize) override
Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity.
inline void squeeze(SizeType32 dim)
Removes the given unit dimension from this tensor.
inline void unsqueeze(SizeType32 dim)
Adds a unit dimension at the specified position.
inline bool shapeEquals(std::initializer_list<SizeType32> const &other) const
template<typename T>
inline bool shapeEquals(T const *dims, SizeType32 count) const
static inline std::int64_t volume(Shape const &dims)
Returns the volume of the dimensions. Returns -1 if d.nbDims < 0.
static inline std::size_t volumeNonNegative(Shape const &shape)
Returns the volume of the dimensions. Throws if d.nbDims < 0.
static Shape squeeze(Shape const &shape, SizeType32 dim)
Removes the given unit dimension from shape.
- Parameters:
shape – The shape to squeeze.
dim – The dimension that should be removed (“squeezed”).
- Returns:
A new shape without the unit dimension.
static Shape unsqueeze(Shape const &shape, SizeType32 dim)
Adds a unit dimension to shape at the specified position.
- Parameters:
shape – The shape to unsqueeze.
dim – The dimension where unit dimension should be added.
- Returns:
A new shape with the added unit dimension.
Creates a sliced view on the underlying tensor. The view will have the same data type as tensor.
- Parameters:
tensor – The tensor to view.
offset – The offset of the view w.r.t. dimension 0 of the tensor.
size – The size of the view w.r.t. dimension 0 of the tensor.
- Returns:
A view on the tensor.
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset, std::size_t size)
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset)
Returns a view on the underlying buffer (or tensor) with the given shape.
- Parameters:
tensor – The tensor to view.
shape – The shape of the view.
- Returns:
A view on the tensor.
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr view(TConstPtr &&tensor, Shape const &dims)
Returns a view on the underlying tensor which can be independently reshaped.
- Parameters:
tensor – The tensor to view.
- Returns:
A view on the tensor.
static UniquePtr wrap(void *data, nvinfer1::DataType type, Shape const &shape, std::size_t capacity)
Wraps the given data in an ITensor. The ITensor will not own the underlying data and cannot be reshaped beyond capacity.
- Parameters:
data – The data to wrap.
type – The data type of the data.
shape – The shape of the tensor.
capacity – The capacity of the buffer.
- Returns:
template<typename T>
static inline UniquePtr wrap(T *data, Shape const &shape, std::size_t capacity)
static Shape makeShape(std::initializer_list<DimType64> const &dims)
A convenience function to create a tensor shape with the given dimensions.
static std::string toString(Shape const &dims)
A convenience function for converting a tensor shape to a string.
static inline bool shapeEquals(Shape const &lhs, Shape const &rhs)
A convenience function to compare shapes.
template<typename T>
static inline bool shapeEquals(Shape const &lhs, T const *dims, SizeType32 count)
A convenience function to compare shapes.
Public Functions
IpcMemory(std::size_t bufferSize, BufferManager const &manager, WorldConfig const &worldConfig)
inline std::vector<void*> const &getCommPtrs() const
static constexpr size_t FLAGS_SIZE = (kernels::MAX_ALL_REDUCE_BLOCKS + 1) * sizeof(uint32_t)
void allocateIpcMemory(std::size_t bufferSize, BufferManager const &manager, WorldConfig const &worldConfig)
void destroyIpcMemory()
class DecodingMode
using UnderlyingType = uint8_t
inline constexpr bool isNone() const
inline constexpr bool isTopK() const
inline constexpr bool isTopP() const
inline constexpr bool isTopKorTopP() const
inline constexpr bool isTopKandTopP() const
inline constexpr bool isBeamSearch() const
inline constexpr bool isMedusa() const
inline bool operator==(DecodingMode const &other) const
static inline constexpr auto None()
static inline constexpr auto TopK()
static inline constexpr auto TopP()
static inline constexpr auto TopKTopP()
static inline constexpr auto BeamSearch()
static inline constexpr auto Medusa()
static inline DecodingMode fromExecutor(executor::DecodingMode decodingMode)
inline constexpr DecodingMode(UnderlyingType state)
inline constexpr bool anyBitSet(UnderlyingType bits) const
inline constexpr bool allBitSet(UnderlyingType bits) const
UnderlyingType mState = {}
static constexpr UnderlyingType kNone = {0}
static constexpr UnderlyingType kTopK = {1u << 0}
static constexpr UnderlyingType kTopP = {1u << 1}
static constexpr UnderlyingType kBeamSearch = {1u << 2}
static constexpr UnderlyingType kMedusa = {1u << 3}
static constexpr UnderlyingType kTopKTopP = {kTopK | kTopP}
friend std::ostream &operator<<(std::ostream &os, DecodingMode other)
std::ostream &operator<<(std::ostream &os, LoraCache::TaskLayerModuleConfig const &v)
class LoraExpectedException : public std::runtime_error
Subclassed by tensorrt_llm::runtime::LoraCacheFullException
class LoraCacheFullException : public tensorrt_llm::runtime::LoraExpectedException
class LoraCachePageManager
- #include <loraCache.h>
Holds memory of lora cache pages, and manages allocation and freeing of whole pages. Memory is pre-allocated either on the host or device
Note that this class is not thread safe
Public Functions
LoraCachePageManager(LoraCachePageManagerConfig const &config, BufferManager const &bufferManager)
- Parameters:
config – [in] a LoraCachePageManagerConfig
bufferManager – [in] a BufferManager used to allocate page blocks
std::optional<std::vector<std::size_t>> claimPages(SizeType32 numPages)
claim pages
- Parameters:
numPages – [in] number of pages to claim
- Returns:
a tuple, where the first value is a boolean indicating whether pages were claimed. If the first value is true, the second value will hold the list of pageIds.
SizeType32 numAvailablePages() const
get number of available (free) pages in manager
- Returns:
number of free pages in manager
void releasePages(std::vector<std::size_t> const &pages)
release given pages
- Parameters:
pages – [in] list of pages to release (free)
ITensor::SharedConstPtr blockPtr(SizeType32 blockIdx) const
return pointer to given page block
- Parameters:
blockIdx – [in]
- Returns:
— pointer to page block
ITensor::SharedConstPtr pagePtr(std::size_t pageIdx) const
return pointer to given page
- Parameters:
pageIdx – [in]
- Returns:
— const pointer to page
Private Functions
void initialize(BufferManager const &bufferManager)
LoraCachePageManager(LoraCachePageManagerConfig const &config, BufferManager const &bufferManager)
class LoraCache
- #include <loraCache.h>
Caches LoRA weights with LRU eviction policy.
Tasks put in the cache are marked in progress and can not be evicted, until they are marked done.
A cache page holds an optimally sized LoRA. A page is of size [numSlots x pageWidth]. An optimally sized LoRA is one that has the configured optimalAdapterSize.
Conceptually a slot corresponds to a r=1, 1-layer, 1-module set of in/out weights. Page width is set to the number of weights in smallest module.
The number of slots per page is then ceilDiv(num weights in optimally sized LoRA, num weights in smallest module)
Cache pages are allocated on one or more blocks
using TaskIdType = std::uint64_t
using TaskLayerModuleConfigListPtr = std::shared_ptr<std::vector<TaskLayerModuleConfig>>
LoraCache(LoraCachePageManagerConfig const &pageManagerConfig, ModelConfig const &modelConfig, WorldConfig const &worldConfig, BufferManager const &bufferManager)
param[in] pageManagerConfig: a LoraCachePageManagerConfig param[in] modelConfig: a ModelConfig param[in] worldConfig: a WorldConfig param[in] bufferManager: a BufferManager only used to allocate page blocks
void put(TaskIdType taskId, TensorPtr weights, TensorPtr config, bool load = true)
put a task in the cache, and claim pages for it, and optionally load task weights.
- Parameters:
taskId – [in] the task id
weights – [in] lora weights tensor
config – [in] lora config tensor
load – [in] if true load weights before returning, otherwise do not
void loadWeights(TaskIdType taskId, TensorPtr weights, TensorPtr config)
load task weights. This method must be called after put. It is designed to be called asynchronously after put returns with load = false
- Parameters:
taskId – [in] the task id
weights – [in] lora weights tensor
config – [in] lora config tensor
inline bool isLoaded(TaskIdType taskId) const
- Parameters:
taskId – [in] the task id
- Returns:
— true if task is loaded (weights are in place) and false otherwise
bool isDone(TaskIdType taskId) const
- Parameters:
taskId – [in] the task id
- Returns:
— true if task is marked done and can be evicted
inline bool has(TaskIdType taskId) const
- Parameters:
taskId – [in] the task id
- Returns:
— true if task is in the cache (not necessarily loaded) and false otherwise
std::shared_ptr<std::vector<TaskLayerModuleConfig>> get(TaskIdType taskId)
- Parameters:
taskId – [in] the task id
- Returns:
— list of Value objects with pointers to task weights
void bump(TaskIdType taskId)
bump task and make it the most recently used
- Parameters:
taskId – [in] the task id
void markTaskDone(TaskIdType taskId)
mark task done meaning it can be evicted
- Parameters:
taskId – [in] the task id
void markAllDone()
mark all tasks in cache done
SizeType32 determineNumPages(TaskIdType taskId) const
- Parameters:
taskId – [in] the taskid
- Returns:
— number of pages needed to store the given task
SizeType32 determineNumPages(TensorPtr config) const
- Parameters:
config – [in] lora config tensor
- Returns:
— number of pages needed to store the task configured with config tensor
bool fits(TensorPtr config) const
- Parameters:
config – [in] a lora config tensor
- Returns:
— true if the task fits in the cache, false otherwise
void copyTask(TaskIdType taskId, LoraCache &deviceCache, bool markDone = false)
copy task to another cache. Caches must have the same page size.
- Parameters:
taskId – [in] the task id to copy
deviceCache – [in] the LoraCache to copy the task to
markDone – [in] mark the copied task done as it’s copied
SizeType32 getNumPages() const
- Returns:
— total number of pages allocated to cache (used or not)
ITensor::SharedConstPtr getPagePtr(size_t pageId) const
- Parameters:
pageId – [in] the page id
- Returns:
— const pointer to page
static std::vector<LoraCache::TaskLayerModuleConfig> copyToPages(TensorPtr weights, TensorPtr config, ModelConfig const &modelConfig, WorldConfig const &worldConfig, std::unordered_map<SizeType32, LoraModule> moduleIdToModel, BufferManager const &manager, std::vector<TensorPtr> const &pages, std::vector<std::size_t> const &pageIds)
Copy task weights to cache pages.
- Parameters:
weights – [in] task weights
config – [in] task config tensor
modelConfig – [in] a ModelConfig
worldConfig – [in] a WorldConfig
moduleIdToModel – [in] map from lora module id to LoraModule
manager – [in] a BufferManager the manager to use to perform the copies
pages – [out] list of page tensors to copy weights to
pageIds – [in] page ids for the pages
- Returns:
— list of cache Values objects
static void splitTransposeCpu(ITensor &output, ITensor const &input, SizeType32 tpSize, SizeType32 tpRank)
splits second dim of input into tpSize parts and writes the tpRank split to output
- Parameters:
output – [out] output tensor
input – [in] input tensor
tpSize – [in] number of splits
tpRank – [in] the split to write to output
void bumpTaskInProgress(TaskIdType taskId)
ValueStatus getStatus(TaskIdType taskId) const
std::vector<std::size_t> claimPagesWithEvict(SizeType32 numPages)
claim numPages, evicting tasks if needed
- Parameters:
numPages – [in] number of pages to claim
- Throws:
std::runtime_error – if all pages cannot be claimed
- Returns:
— list of page ids
std::map<size_t, std::pair<size_t, SizeType32>> copyTaskMapPages(TaskValue &targetTaskValue, TaskValue const &sourceTaskValue, std::vector<size_t> const &targetPageIds, LoraCache const &targetCache)
Internal helper method used inside copyTask. Not thread safe on its own
LoraCachePageManagerConfig mPageManagerConfig
ModelConfig mModelConfig
WorldConfig mWorldConfig
mutable std::mutex mPagesMutex
std::unique_ptr<LoraCachePageManager> mCachePageManager
mutable std::mutex mCacheMutex
std::unordered_map<TaskIdType, TaskValuePtr> mCacheMap
std::list<TaskIdType> mInProgressTasks
std::list<TaskIdType> mDoneTasks
std::vector<std::unique_ptr<BufferManager>> mDeviceBufferManagers
std::unique_ptr<BufferManager> mBufferManager
std::unordered_map<SizeType32, LoraModule> mModuleIdToModule
template<typename T>
static void splitTransposeCpuInner(ITensor &output, ITensor const &input, SizeType32 tpSize, SizeType32 tpRank)
- #include <loraCache.h>
Contains information on a single layer / module. A list of these configs is associated with each task and can be used to populate runtime tensors.
std::string toString() const
bool operator==(LoraCache::TaskLayerModuleConfig const &o) const
Public Members
std::size_t pageId
SizeType32 slotIdx
SizeType32 inSize
SizeType32 outSize
SizeType32 moduleId
SizeType32 layerId
SizeType32 adapterSize
SizeType32 numSlots
std::int64_t weightsInPointer
std::int64_t weightsOutPointer
inline SizeType32 getGpu() const
inline SizeType32 getCpu() const
inline SizeType32 getPinned() const
inline SizeType32 getUVM() const
inline void allocate(SizeType32 size)
void allocate(MemoryType memoryType, SizeType32 size)
template<MemoryType T>
inline void deallocate(SizeType32 size)
void deallocate(MemoryType memoryType, SizeType32 size)
std::string toString() const
Public Static Functions
static MemoryCounters &getInstance()
static std::string bytesToString(SizeType32 bytes, int precision = 2)
Private Members
std::atomic<SizeType32> mGpu = {}
std::atomic<SizeType32> mCpu = {}
std::atomic<SizeType32> mPinned = {}
std::atomic<SizeType32> mUVM = {}
class GenericPromptTuningParams -
class PromptTuningParams : public tensorrt_llm::runtime::GenericPromptTuningParams<ITensor::SharedPtr>
using SizeType32 = GenericPromptTuningParams::SizeType32
inline explicit PromptTuningParams(TensorPtr embeddingTable = nullptr, TensorPtr tasks = nullptr, TensorPtr vocabSize = nullptr)
void fillTasksTensor(TensorPtr tasksHost, const SizeType32 batchSize, const SizeType32 numContextRequests, std::vector<SizeType32> const &reqBeamWidths, std::vector<SizeType32> const &reqPromptLengths, BufferManager const &manager, bool packedInput)
inline explicit SamplingConfig(SizeType32 beamWidth = 1)
inline explicit SamplingConfig(std::vector<SamplingConfig> const &configs)
inline explicit SamplingConfig(executor::SamplingConfig const &samplingConfig, std::optional<executor::SpeculativeDecodingConfig> const &specDecodingConfig)
inline bool operator==(SamplingConfig const &other) const
Public Members
SizeType32 beamWidth
OptVec<SizeType32> minLength
OptVec<SizeType32> topK
OptVec<TokenIdType> topPResetIds
OptVec<SizeType32> earlyStopping
OptVec<std::vector<runtime::SizeType32>> topKMedusaHeads
std::optional<bool> normalizeLogProbs
using FloatType = float
Public Functions
explicit WorldConfig(SizeType32 tensorParallelism = 1, SizeType32 pipelineParallelism = 1, SizeType32 rank = 0, SizeType32 gpusPerNode = kDefaultGpusPerNode, std::optional<std::vector<SizeType32>> const &deviceIds = std::nullopt)
inline constexpr SizeType32 getSize() const noexcept
inline constexpr SizeType32 getTensorParallelism() const noexcept
inline constexpr bool isTensorParallel() const noexcept
inline constexpr SizeType32 getPipelineParallelism() const noexcept
inline constexpr bool isPipelineParallel() const noexcept
inline constexpr SizeType32 getRank() const noexcept
inline constexpr SizeType32 getGpusPerNode() const noexcept
inline SizeType32 getGpusPerGroup() const noexcept
inline SizeType32 getDevice() const noexcept
inline SizeType32 getDeviceOf(SizeType32 rank) const noexcept
inline constexpr SizeType32 getPipelineParallelRank() const noexcept
inline constexpr SizeType32 getTensorParallelRank() const noexcept
inline constexpr SizeType32 getLocalRank() const noexcept
inline constexpr SizeType32 getNodeRank() const noexcept
inline constexpr SizeType32 getNodeRankOf(SizeType32 rank) const noexcept
inline constexpr bool isFirstPipelineParallelRank() const noexcept
inline constexpr bool isLastPipelineParallelRank() const noexcept
Is my rank the last rank in its pipeline?
inline constexpr SizeType32 getLastRank() const noexcept
std::vector<SizeType32> getPipelineParallelGroup() const
std::vector<SizeType32> getTensorParallelGroup() const
bool validMpiConfig() const
Public Static Functions
static WorldConfig mpi(SizeType32 gpusPerNode = kDefaultGpusPerNode, std::optional<SizeType32> tensorParallelism = std::nullopt, std::optional<SizeType32> pipelineParallelism = std::nullopt, std::optional<std::vector<SizeType32>> const &deviceIds = std::nullopt)
Public Static Attributes
static constexpr SizeType32 kDefaultGpusPerNode = 1
SizeType32 mTensorParallelism
SizeType32 mPipelineParallelism
SizeType32 mRank
SizeType32 mGpusPerNode
std::vector<SizeType32> mDeviceIds
- #include <loraCachePageManagerConfig.h>
Configuration for LoraCachePageManager
See LoraCache docs for description of pages, slots, and page blocks.
inline explicit constexpr LoraCachePageManagerConfig(runtime::MemoryType memType, nvinfer1::DataType dType, SizeType32 totalNumPages, SizeType32 maxPagesPerBlock, SizeType32 slotsPerPage, SizeType32 pageWidth, SizeType32 numCopyStreams)
inline constexpr runtime::MemoryType getMemoryType() const noexcept
inline constexpr void setMemoryType(runtime::MemoryType const &memoryType) noexcept
inline constexpr SizeType32 getTotalNumPages() const noexcept
inline constexpr void setTotalNumPage(SizeType32 const &totalNumPages) noexcept
inline constexpr SizeType32 getMaxPagesPerBlock() const noexcept
inline constexpr void setMaxPagesPerBlock(SizeType32 const &maxPagesPerBlock) noexcept
inline constexpr SizeType32 getSlotsPerPage() const noexcept
inline constexpr void setSlotsPerPage(SizeType32 const &slotsPerPage) noexcept
inline constexpr SizeType32 getPageWidth() const noexcept
inline constexpr void setPageWidth(SizeType32 const &pageWidth) noexcept
inline constexpr bool getInitToZero() const noexcept
inline constexpr void setInitToZero(bool initToZero) noexcept
inline constexpr SizeType32 getNumCopyStreams() const noexcept
inline constexpr void setNumCopyStreams(SizeType32 numCopyStreams) noexcept
runtime::MemoryType mMemoryType
SizeType32 mTotalNumPages
SizeType32 mMaxPagesPerBlock
SizeType32 mSlotsPerPage
SizeType32 mPageWidth
SizeType32 mNumCopyStreams = 1
bool mInitToZero
Public Types
enumerator kINVALID
enumerator kATTN_QKV
enumerator kATTN_Q
enumerator kATTN_K
enumerator kATTN_V
enumerator kATTN_DENSE
enumerator kMLP_H_TO_4H
enumerator kMLP_4H_TO_H
enumerator kMLP_GATE
enumerator kCROSS_ATTN_QKV
enumerator kCROSS_ATTN_Q
enumerator kCROSS_ATTN_K
enumerator kCROSS_ATTN_V
enumerator kCROSS_ATTN_DENSE
enumerator kMOE_H_TO_4H
enumerator kMOE_4H_TO_H
enumerator kMOE_GATE
enumerator kMOE_ROUTER
enumerator kINVALID
inline explicit constexpr LoraModule(ModuleType const &t, SizeType32 inDim, SizeType32 outDim, bool inDimFirst, bool outDimFirst, SizeType32 inTpSplitDim, SizeType32 outTpSplitDim) noexcept
inline explicit constexpr LoraModule() noexcept
explicit constexpr LoraModule(LoraModule const &o) = default
constexpr LoraModule &operator=(LoraModule const &o) = default
inline constexpr SizeType32 flattenedInOutSize(SizeType32 adapterSize) const noexcept
inline constexpr SizeType32 inSize(SizeType32 adapterSize) const noexcept
inline constexpr SizeType32 outSize(SizeType32 adapterSize) const noexcept
inline constexpr SizeType32 localInSize(SizeType32 adapterSize, SizeType32 tpSize) const noexcept
inline constexpr SizeType32 localOutSize(SizeType32 adapterSize, SizeType32 tpSize) const noexcept
inline constexpr SizeType32 localInDim(SizeType32 tpSize) const noexcept
inline constexpr SizeType32 localOutDim(SizeType32 tpSize) const noexcept
inline constexpr SizeType32 localInAdapterSize(SizeType32 adapterSize, SizeType32 tpSize) const noexcept
inline constexpr SizeType32 localOutAdapterSize(SizeType32 adapterSize, SizeType32 tpSize) const noexcept
inline constexpr SizeType32 localInOutSize(SizeType32 adapterSize, SizeType32 tpSize) const noexcept
inline constexpr SizeType32 value() const noexcept
inline constexpr std::string_view name() const noexcept
inline constexpr SizeType32 inDim() const noexcept
inline constexpr SizeType32 outDim() const noexcept
inline constexpr bool inDimFirst() const noexcept
inline constexpr bool outDimFirst() const noexcept
inline constexpr SizeType32 inTpSplitDim() const noexcept
inline constexpr SizeType32 outTpSplitDim() const noexcept
Public Static Functions
static std::vector<LoraModule> createLoraModules(std::vector<std::string> const &loraModuleNames, SizeType32 hiddenSize, SizeType32 mlpHiddenSize, SizeType32 numAttentionHeads, SizeType32 numKvAttentionHeads, SizeType32 attentionHeadSize, SizeType32 tpSize)
static inline constexpr ModuleType toModuleType(std::string_view const &name)
static inline constexpr std::string_view toModuleName(ModuleType t) noexcept
static inline constexpr std::string_view toModuleName(SizeType32 id)
Private Members
ModuleType mType
SizeType32 mInDim
SizeType32 mOutDim
bool mInDimFirst
bool mOutDimFirst
SizeType32 mInTpSplitDim
SizeType32 mOutTpSplitDim
inline explicit ModelConfig(SizeType32 vocabSize, SizeType32 nbAttentionLayers, SizeType32 nbRnnLayers, SizeType32 nbHeads, SizeType32 hiddenSize, nvinfer1::DataType dtype)
inline constexpr SizeType32 getVocabSize() const noexcept
inline constexpr SizeType32 getVocabSizePadded(SizeType32 worldSize) const noexcept
inline constexpr SizeType32 getNbAttentionLayers(SizeType32 pipelineParallelism = 1) const
inline constexpr SizeType32 getNbRnnLayers(SizeType32 pipelineParallelism = 1) const
inline constexpr SizeType32 getNbHeads() const noexcept
inline constexpr SizeType32 getNbKvHeads() const noexcept
inline constexpr void setNbKvHeads(SizeType32 nbKvHeads) noexcept
inline constexpr SizeType32 getHiddenSize() const noexcept
inline constexpr SizeType32 getSizePerHead() const noexcept
inline constexpr void setSizePerHead(SizeType32 sizePerHead) noexcept
inline constexpr bool useGptAttentionPlugin() const noexcept
inline constexpr void useGptAttentionPlugin(bool useGptAttentionPlugin) noexcept
inline constexpr bool useMambaConv1dPlugin() const noexcept
inline constexpr void useMambaConv1dPlugin(bool useMambaConv1dPlugin) noexcept
inline constexpr bool usePackedInput() const noexcept
inline constexpr void usePackedInput(bool inputPacked) noexcept
inline constexpr bool usePagedKvCache() const noexcept
inline constexpr void usePagedKvCache(bool pagedKvCache) noexcept
inline constexpr bool usePagedState() const noexcept
inline constexpr void usePagedState(bool pagedState) noexcept
inline constexpr SizeType32 getTokensPerBlock() const noexcept
inline constexpr void setTokensPerBlock(SizeType32 TokensPerBlock) noexcept
inline constexpr common::QuantMode getQuantMode() const noexcept
inline constexpr void setQuantMode(common::QuantMode QuantMode) noexcept
inline constexpr bool supportsInflightBatching() const noexcept
inline constexpr SizeType32 getMaxBatchSize() const noexcept
inline constexpr void setMaxBatchSize(SizeType32 maxBatchSize) noexcept
inline constexpr SizeType32 getMaxBeamWidth() const noexcept
inline constexpr void setMaxBeamWidth(SizeType32 maxBeamWidth) noexcept
inline constexpr SizeType32 getMaxInputLen() const noexcept
inline constexpr void setMaxInputLen(SizeType32 maxInputLen) noexcept
inline constexpr SizeType32 getMaxSequenceLen() const noexcept
inline constexpr void setMaxSequenceLen(SizeType32 maxSequenceLen) noexcept
inline constexpr std::optional<SizeType32> getMaxNumTokens() const noexcept
inline constexpr void setMaxNumTokens(std::optional<SizeType32> maxNumTokens) noexcept
inline constexpr bool usePromptTuning() const noexcept
inline constexpr SizeType32 getMaxPromptEmbeddingTableSize() const noexcept
inline constexpr void setMaxPromptEmbeddingTableSize(SizeType32 maxPromptEmbeddingTableSize) noexcept
inline constexpr bool computeContextLogits() const noexcept
inline constexpr void computeContextLogits(bool computeContextLogits) noexcept
inline constexpr bool computeGenerationLogits() const noexcept
inline constexpr void computeGenerationLogits(bool computeGenerationLogits) noexcept
inline ModelVariant getModelVariant() const
inline void setModelVariant(ModelVariant modelVariant)
inline constexpr bool useCustomAllReduce() const noexcept
inline constexpr void useCustomAllReduce(bool customAllReduce) noexcept
inline constexpr void setMaxDraftLen(SizeType32 maxDraftLen) noexcept
inline SizeType32 getMaxDraftLen() const
inline constexpr SizeType32 getMaxTokensPerStep() const noexcept
inline constexpr void setUseContextFMHAForGeneration(bool useContextFMHAForGeneration) noexcept
inline constexpr bool getContextFMHAForGeneration() const noexcept
inline constexpr void setPagedContextFMHA(bool pagedContextFMHA) noexcept
inline constexpr bool getPagedContextFMHA() const noexcept
inline constexpr void useXQA(bool useXQA) noexcept
inline constexpr bool useXQA() const noexcept
inline constexpr bool useLoraPlugin() const noexcept
inline constexpr void useLoraPlugin(bool useLoraPlugin) noexcept
inline std::vector<LoraModule> const &getLoraModules() const noexcept
inline void setLoraModules(std::vector<LoraModule> const &loraModules) noexcept
inline constexpr SizeType32 getMlpHiddenSize() const noexcept
inline constexpr void setMlpHiddenSize(SizeType32 mlpHiddenSize) noexcept
inline constexpr bool useCrossAttention() const noexcept
inline constexpr void useCrossAttention(bool newCrossAttention) noexcept
inline constexpr bool usePositionEmbedding() const noexcept
inline constexpr void usePositionEmbedding(bool newPositionEmbedding) noexcept
inline constexpr bool useTokenTypeEmbedding() const noexcept
inline constexpr void useTokenTypeEmbedding(bool newTokenTypeEmbedding) noexcept
inline constexpr SizeType32 getFfnHiddenSize() const noexcept
inline constexpr void setFfnHiddenSize(SizeType32 ffnHiddenSize) noexcept
inline constexpr SizeType32 getMaxLoraRank() const noexcept
inline constexpr void setMaxLoraRank(SizeType32 maxLoraRank) noexcept
inline constexpr bool useMedusa() const noexcept
inline std::optional<MedusaModule> getMedusaModule() const noexcept
inline void setMedusaModule(MedusaModule const &medusaModule) noexcept
inline constexpr bool isTransformerBased() const noexcept
inline bool hasRnnConfig() const noexcept
inline constexpr bool isRnnBased() const noexcept
Private Members
SizeType32 mVocabSize
SizeType32 mNbAttentionLayers
SizeType32 mNbRnnLayers
SizeType32 mNbHeads
SizeType32 mNbKvHeads
SizeType32 mHiddenSize
SizeType32 mSizePerHead
bool mUseGptAttentionPlugin
bool mUseMambaConv1dPlugin
bool mInputPacked
bool mPagedKvCache
bool mPagedState
SizeType32 mTokensPerBlock
common::QuantMode mQuantMode
SizeType32 mMaxBatchSize
SizeType32 mMaxBeamWidth
SizeType32 mMaxInputLen
SizeType32 mMaxSequenceLen
std::optional<SizeType32> mMaxNumTokens
bool mComputeContextLogits
bool mComputeGenerationLogits
ModelVariant mModelVariant
bool mUseCustomAllReduce
SizeType32 mMaxPromptEmbeddingTableSize
SizeType32 mMaxDraftLen
bool mUseContextFMHAForGeneration
bool mPagedContextFMHA
bool mUseXQA
bool mUseLoraPlugin
std::vector<LoraModule> mLoraModules
SizeType32 mMlpHiddenSize
SizeType32 mMaxLoraRank
std::optional<MedusaModule> mMedusaModule
bool mUseCrossAttention
bool mUsePositionEmbedding
bool mUseTokenTypeEmbedding
SizeType32 mFfnHiddenSize
Public Types
Public Functions
inline constexpr bool isDraftModel() const
inline constexpr bool isMedusa() const
inline constexpr bool isLookaheadDecoding() const
inline constexpr bool requiresAttentionMask() const
inline constexpr bool predictsDraftTokens() const
inline constexpr bool needsKVCacheRewind() const
inline constexpr bool hasDraftLogits() const
inline bool operator==(SpeculativeDecodingMode const &other) const
Public Static Functions
static inline constexpr auto DraftModel()
static inline constexpr auto Medusa()
static inline constexpr auto LookaheadDecoding()
inline constexpr bool anyBitSet(UnderlyingType bits) const
inline constexpr bool allBitSet(UnderlyingType bits) const
UnderlyingType mState = {kNone}
static constexpr UnderlyingType kNone = {1u << 0}
static constexpr UnderlyingType kDraftModel = {1u << 1}
static constexpr UnderlyingType kMedusa = {1u << 2}
static constexpr UnderlyingType kLookaheadDecoding = {1u << 3}
