namespace tensorrt_llm
namespace executor
SizeType32 const kDefaultIterStatsMaxIterations = 1000
SizeType32 const kDefaultRequestStatsMaxIterations = 0
class SamplingConfig
- #include <executor.h>
Sampling configuration.
Public Functions
explicit SamplingConfig(SizeType32 beamWidth = 1, std::optional<SizeType32> const &topK = std::nullopt, std::optional<FloatType> const &topP = std::nullopt, std::optional<FloatType> const &topPMin = std::nullopt, std::optional<TokenIdType> const &topPResetIds = std::nullopt, std::optional<FloatType> const &topPDecay = std::nullopt, std::optional<RandomSeedType> const &randomSeed = std::nullopt, std::optional<FloatType> const &temperature = std::nullopt, std::optional<SizeType32> const &minLength = std::nullopt, std::optional<FloatType> const &beamSearchDiversityRate = std::nullopt, std::optional<FloatType> const &repetitionPenalty = std::nullopt, std::optional<FloatType> const &presencePenalty = std::nullopt, std::optional<FloatType> const &frequencyPenalty = std::nullopt, std::optional<FloatType> const &lengthPenalty = std::nullopt, std::optional<SizeType32> const &earlyStopping = std::nullopt)
Constructor for SamplingConfig. See description of parameters below.
bool operator==(SamplingConfig const &other) const
SizeType32 getBeamWidth() const
std::optional<SizeType32> getTopK() const
std::optional<SizeType32> getTopPResetIds() const
std::optional<RandomSeedType> getRandomSeed() const
std::optional<SizeType32> getMinLength() const
std::optional<SizeType32> getEarlyStopping() const
Private Members
SizeType32 mBeamWidth
The beam width. Default is 1 which disables beam search.
std::optional<SizeType32> mTopK
Controls number of logits to sample from. Default is 0 (all logits).
std::optional<FloatType> mTopPMin
Controls decay in the top-P algorithm. topPMin is lower-bound. Default is 1.e-6.
std::optional<TokenIdType> mTopPResetIds
Controls decay in the top-P algorithm. Indicates where to reset the decay. Default is 1.
std::optional<FloatType> mTopPDecay
Controls decay in the top-P algorithm. The decay value. Default is 1.f.
std::optional<RandomSeedType> mRandomSeed
Controls the random seed used by the random number generator in sampling.
std::optional<FloatType> mTemperature
Controls the modulation of logits when sampling new tokens. It can have values > 0.f. Default is 1.0f.
std::optional<SizeType32> mMinLength
Lower bound on the number of tokens to generate. Values < 1 have no effect. Default is 1.
std::optional<FloatType> mRepetitionPenalty
Used to penalize tokens based on how often they appear in the sequence. It can have any value > 0.f. Values < 1.f encourage repetition, values > 1.f discourage it. Default is 1.f.
std::optional<FloatType> mPresencePenalty
Used to penalize tokens already present in the sequence (irrespective of the number of appearances). It can have any values. Values < 0.f encourage repetition, values > 0.f discourage it. Default is 0.f.
std::optional<FloatType> mFrequencyPenalty
Used to penalize tokens already present in the sequence (dependent on the number of appearances). It can have any values. Values < 0.f encourage repetition, values > 0.f discourage it. Default is 0.f.
std::optional<FloatType> mLengthPenalty
Controls how to penalize longer sequences in beam search. Default is 0.f.
std::optional<SizeType32> mEarlyStopping
Controls whether the generation process finishes once beamWidth sentences are generated (ends with end_token)
- friend class Serialization
explicit SamplingConfig(SizeType32 beamWidth = 1, std::optional<SizeType32> const &topK = std::nullopt, std::optional<FloatType> const &topP = std::nullopt, std::optional<FloatType> const &topPMin = std::nullopt, std::optional<TokenIdType> const &topPResetIds = std::nullopt, std::optional<FloatType> const &topPDecay = std::nullopt, std::optional<RandomSeedType> const &randomSeed = std::nullopt, std::optional<FloatType> const &temperature = std::nullopt, std::optional<SizeType32> const &minLength = std::nullopt, std::optional<FloatType> const &beamSearchDiversityRate = std::nullopt, std::optional<FloatType> const &repetitionPenalty = std::nullopt, std::optional<FloatType> const &presencePenalty = std::nullopt, std::optional<FloatType> const &frequencyPenalty = std::nullopt, std::optional<FloatType> const &lengthPenalty = std::nullopt, std::optional<SizeType32> const &earlyStopping = std::nullopt)
class OutputConfig
- #include <executor.h>
Configuration that controls the outputs of a Result.
Public Functions
explicit OutputConfig(bool returnLogProbs = false, bool returnContextLogits = false, bool returnGenerationLogits = false, bool excludeInputFromOutput = false)
explicit OutputConfig(bool returnLogProbs = false, bool returnContextLogits = false, bool returnGenerationLogits = false, bool excludeInputFromOutput = false)
class SpeculativeDecodingConfig
- #include <executor.h>
Configuration for speculative decoding. Allows including draft tokens and draft logits, and specifying an acceptance threshold.
Public Functions
Private Members
- friend class Serialization
class PromptTuningConfig
- #include <executor.h>
Configuration for prompt tuning.
Public Functions
Private Members
- friend class Serialization
class LoraConfig
- #include <executor.h>
Configuration for LoRA.
Public Functions
Private Members
- friend class Serialization
class Request
- #include <executor.h>
A class that holds information about the request.
Public Functions
Request(VecTokens inputTokenIds, SizeType32 maxNewTokens, bool streaming = false, SamplingConfig const &samplingConfig = SamplingConfig(), OutputConfig const &outputConfig = OutputConfig(), std::optional<SizeType32> const &endId = std::nullopt, std::optional<SizeType32> const &padId = std::nullopt, std::optional<std::list<VecTokens>> badWords = std::nullopt, std::optional<std::list<VecTokens>> stopWords = std::nullopt, std::optional<Tensor> embeddingBias = std::nullopt, std::optional<SpeculativeDecodingConfig> speculativeDecodingConfig = std::nullopt, std::optional<PromptTuningConfig> pTuningConfig = std::nullopt, std::optional<LoraConfig> loraConfig = std::nullopt, std::optional<std::string> logitsPostProcessorName = std::nullopt)
The Request constructor.
- Parameters:
inputTokenIds – The input token ids
maxNewTokens – The maximum number of tokens to generate
streaming – Indicates if the responses should be streamed or not. Default is false.
samplingConfig – The sampling configuration
outputConfig – The output configuration
endId – The end token id
padId – The pad token id
badWords – A list of bad words tokens. Each “word” can be composed of multiple tokens
stopWords – A list of stop words tokens. Each “word” can be composed of multiple tokens
embeddingBias – The embedding bias tensor. Expected type is kFP32 and shape is [vocab_size]
speculativeDecodingConfig – The speculative decoding configuration
pTuningConfig – The prompt tuning configuration
loraConfig – The LoRA configuration
logitsPostProcessorName – The logits postprocessor name. Must correspond to one of the logits postprocessor names provided to the ExecutorConfig.
SizeType32 getMaxNewTokens() const
bool getStreaming() const
SamplingConfig getSamplingConfig() const
OutputConfig getOutputConfig() const
std::optional<SizeType32> getEndId() const
std::optional<SizeType32> getPadId() const
std::optional<SpeculativeDecodingConfig> getSpeculativeDecodingConfig() const
std::optional<PromptTuningConfig> getPromptTuningConfig() const
std::optional<LoraConfig> getLoraConfig() const
std::optional<std::string> getLogitsPostProcessorName() const
void setStreaming(bool streaming)
void setSamplingConfig(SamplingConfig const &config)
void setOutputConfig(OutputConfig const &outputConfig)
void setEndId(SizeType32 endId)
void setPadId(SizeType32 padId)
void setSpeculativeDecodingConfig(SpeculativeDecodingConfig const &specDecodingConfig)
void setPromptTuningConfig(PromptTuningConfig const &pTuningConfig)
void setLoraConfig(LoraConfig const &loraConfig)
void setLogitsPostProcessorName(std::string const &logitsPostProcessorName)
Private Members
std::unique_ptr<Impl> mImpl
- friend class Serialization
Request(VecTokens inputTokenIds, SizeType32 maxNewTokens, bool streaming = false, SamplingConfig const &samplingConfig = SamplingConfig(), OutputConfig const &outputConfig = OutputConfig(), std::optional<SizeType32> const &endId = std::nullopt, std::optional<SizeType32> const &padId = std::nullopt, std::optional<std::list<VecTokens>> badWords = std::nullopt, std::optional<std::list<VecTokens>> stopWords = std::nullopt, std::optional<Tensor> embeddingBias = std::nullopt, std::optional<SpeculativeDecodingConfig> speculativeDecodingConfig = std::nullopt, std::optional<PromptTuningConfig> pTuningConfig = std::nullopt, std::optional<LoraConfig> loraConfig = std::nullopt, std::optional<std::string> logitsPostProcessorName = std::nullopt)
struct Result
- #include <executor.h>
Struct that holds the generation result.
Public Members
bool isFinal
Indicates if this is the final result for the request.
BeamTokens outputTokenIds
The output tokens for each beam.
std::optional<VecLogProbs> cumLogProbs
The cumulative log probabilities. Size beamSize.
std::optional<std::vector<VecLogProbs>> logProbs
The log probabilities for each generated token. Size [beamSize, outputLen].
bool isFinal
class Response
- #include <executor.h>
Class that holds either an error or a result.
Public Functions
bool hasError() const
Indicates if this response has an error or not.
std::string getErrorMsg() const
Get the error message for this response. Will throw an exception if hasError is false.
Private Members
std::unique_ptr<Impl> mImpl
- friend class Serialization
class SchedulerConfig
- #include <executor.h>
Configuration class for the scheduler.
Public Functions
explicit SchedulerConfig(CapacitySchedulerPolicy capacitySchedulerPolicy = CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT, std::optional<ContextChunkingPolicy> contextChunkingPolicy = std::nullopt)
CapacitySchedulerPolicy getCapacitySchedulerPolicy() const
std::optional<ContextChunkingPolicy> getContextChunkingPolicy() const
Private Members
CapacitySchedulerPolicy mCapacitySchedulerPolicy
The capacity scheduler policy. See CapacitySchedulerPolicy.
std::optional<ContextChunkingPolicy> mContextChunkingPolicy
The context chunking policy. See ContextChunkingPolicy.
- friend class Serialization
explicit SchedulerConfig(CapacitySchedulerPolicy capacitySchedulerPolicy = CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT, std::optional<ContextChunkingPolicy> contextChunkingPolicy = std::nullopt)
class KvCacheConfig
- #include <executor.h>
Configuration class for the KV cache.
Public Functions
explicit KvCacheConfig(bool enableBlockReuse = false, std::optional<SizeType32> const &maxTokens = std::nullopt, std::optional<SizeType32> const &maxAttentionWindow = std::nullopt, std::optional<SizeType32> const &sinkTokenLength = std::nullopt, std::optional<FloatType> const &freeGpuMemoryFraction = std::nullopt, std::optional<size_t> const &hostCacheSize = std::nullopt, bool onboardBlocks = true)
bool getEnableBlockReuse() const
std::optional<SizeType32> getMaxTokens() const
std::optional<SizeType32> getMaxAttentionWindow() const
std::optional<SizeType32> getSinkTokenLength() const
std::optional<size_t> getHostCacheSize() const
bool getOnboardBlocks() const
Private Members
bool mEnableBlockReuse
Controls if KV cache blocks can be reused for different requests.
std::optional<SizeType32> mMaxTokens
The maximum number of tokens that should be stored in the KV cache. If both mMaxTokens and mFreeGpuMemoryFraction are specified, memory corresponding to the minimum will be allocated.
std::optional<SizeType32> mMaxAttentionWindow
Size of the attention window for each sequence. Only the last mMaxAttentionWindow tokens of each sequence will be stored in the KV cache.
std::optional<SizeType32> mSinkTokenLength
Number of sink tokens (tokens to always keep in attention window)
std::optional<FloatType> mFreeGpuMemoryFraction
The fraction of GPU memory that should be allocated for the KV cache. Default is 90%. If both mMaxTokens and mFreeGpuMemoryFraction are specified, memory corresponding to the minimum will be allocated.
std::optional<size_t> mHostCacheSize
Size of secondary memory pool in bytes. Default is 0. Having a secondary memory pool increases KV cache block reuse potential.
bool mOnboardBlocks
Controls whether offloaded blocks should be onboarded back into primary memory before being reused.
- friend class Serialization
explicit KvCacheConfig(bool enableBlockReuse = false, std::optional<SizeType32> const &maxTokens = std::nullopt, std::optional<SizeType32> const &maxAttentionWindow = std::nullopt, std::optional<SizeType32> const &sinkTokenLength = std::nullopt, std::optional<FloatType> const &freeGpuMemoryFraction = std::nullopt, std::optional<size_t> const &hostCacheSize = std::nullopt, bool onboardBlocks = true)
class OrchestratorConfig
class ParallelConfig
- #include <executor.h>
A configuration class for the parallel execution parameters. Currently only supports commType = CommunicationType::kMPI.
Public Functions
explicit ParallelConfig(CommunicationType commType = CommunicationType::kMPI, CommunicationMode commMode = CommunicationMode::kLEADER, std::optional<std::vector<SizeType32>> deviceIds = std::nullopt, std::optional<std::vector<SizeType32>> participantIds = std::nullopt, std::optional<OrchestratorConfig> const &orchestratorConfig = std::nullopt)
- Parameters:
commType – The communication type. See CommunicationType.
commMode – The communication mode. See CommunicationMode.
deviceIds – The IDs of the GPUs involved in the execution of the model
participantIds – The participant IDs (MPI ranks if commType == kMPI) involved in the execution of the model. The first participant is considered to be the leader.
CommunicationType getCommunicationType() const
CommunicationMode getCommunicationMode() const
std::optional<std::vector<SizeType32>> getDeviceIds() const
std::optional<std::vector<SizeType32>> getParticipantIds() const
std::optional<OrchestratorConfig> getOrchestratorConfig() const
void setCommunicationType(CommunicationType type)
void setCommunicationMode(CommunicationMode mode)
void setDeviceIds(std::vector<SizeType32> const &deviceIds)
void setParticipantIds(std::vector<SizeType32> const &participantIds)
void setOrchestratorConfig(OrchestratorConfig const &orchestratorConfig)
Private Members
CommunicationType mCommType
The type of communication protocol used. Default is MPI.
CommunicationMode mCommMode
The mode of communication. See CommunicationMode.
std::optional<std::vector<SizeType32>> mDeviceIds
The GPU device ids to use for executing this model.
std::optional<std::vector<SizeType32>> mParticipantIds
The participant ids (MPI ranks for example) used for executing this model.
std::optional<OrchestratorConfig> mOrchestratorConfig
Optional orchestrator configuration.
- friend class Serialization
explicit ParallelConfig(CommunicationType commType = CommunicationType::kMPI, CommunicationMode commMode = CommunicationMode::kLEADER, std::optional<std::vector<SizeType32>> deviceIds = std::nullopt, std::optional<std::vector<SizeType32>> participantIds = std::nullopt, std::optional<OrchestratorConfig> const &orchestratorConfig = std::nullopt)
class PeftCacheConfig
- #include <executor.h>
config for PeftCacheManager
Public Functions
explicit PeftCacheConfig(SizeType32 numHostModuleLayer = 0, SizeType32 numDeviceModuleLayer = 0, SizeType32 optimalAdapterSize = 8, SizeType32 maxAdapterSize = 64, SizeType32 numPutWorkers = 1, SizeType32 numEnsureWorkers = 1, SizeType32 numCopyStreams = 1, SizeType32 maxPagesPerBlockHost = 24, SizeType32 maxPagesPerBlockDevice = 8, std::optional<float> const &deviceCachePercent = std::nullopt, std::optional<size_t> const &hostCacheSize = std::nullopt)
bool operator==(PeftCacheConfig const &other) const
SizeType32 getNumHostModuleLayer() const
SizeType32 getNumDeviceModuleLayer() const
SizeType32 getOptimalAdapterSize() const
SizeType32 getMaxAdapterSize() const
SizeType32 getNumPutWorkers() const
SizeType32 getNumEnsureWorkers() const
SizeType32 getNumCopyStreams() const
SizeType32 getMaxPagesPerBlockHost() const
SizeType32 getMaxPagesPerBlockDevice() const
std::optional<float> getDeviceCachePercent() const
std::optional<size_t> getHostCacheSize() const
Private Members
SizeType32 mNumHostModuleLayer
SizeType32 mNumDeviceModuleLayer
SizeType32 mOptimalAdapterSize
SizeType32 mMaxAdapterSize
SizeType32 mNumPutWorkers
SizeType32 mNumEnsureWorkers
SizeType32 mNumCopyStreams
SizeType32 mMaxPagesPerBlockHost
SizeType32 mMaxPagesPerBlockDevice
std::optional<size_t> mHostCacheSize
- friend class Serialization
explicit PeftCacheConfig(SizeType32 numHostModuleLayer = 0, SizeType32 numDeviceModuleLayer = 0, SizeType32 optimalAdapterSize = 8, SizeType32 maxAdapterSize = 64, SizeType32 numPutWorkers = 1, SizeType32 numEnsureWorkers = 1, SizeType32 numCopyStreams = 1, SizeType32 maxPagesPerBlockHost = 24, SizeType32 maxPagesPerBlockDevice = 8, std::optional<float> const &deviceCachePercent = std::nullopt, std::optional<size_t> const &hostCacheSize = std::nullopt)
class ExecutorConfig
- #include <executor.h>
Configuration class for the model executor.
Public Functions
explicit ExecutorConfig(SizeType32 maxBeamWidth = 1, SchedulerConfig const &schedulerConfig = SchedulerConfig(), KvCacheConfig const &kvCacheConfig = KvCacheConfig(), bool enableChunkedContext = false, bool normalizeLogProbs = true, SizeType32 iterStatsMaxIterations = kDefaultIterStatsMaxIterations, SizeType32 requestStatsMaxIterations = kDefaultRequestStatsMaxIterations, BatchingType batchingType = BatchingType::kINFLIGHT, std::optional<ParallelConfig> parallelConfig = std::nullopt, std::optional<PeftCacheConfig> const &peftCacheConfig = std::nullopt, std::optional<LogitsPostProcessorMap> logitsPostProcessorMap = std::nullopt, std::optional<MedusaChoices> medusaChoices = std::nullopt, std::optional<DecodingMode> decodingMode = std::nullopt, float gpuWeightsPercent = 1)
SizeType32 getMaxBeamWidth() const
SchedulerConfig getSchedulerConfig() const
KvCacheConfig getKvCacheConfig() const
bool getEnableChunkedContext() const
bool getNormalizeLogProbs() const
SizeType32 getIterStatsMaxIterations() const
SizeType32 getRequestStatsMaxIterations() const
BatchingType getBatchingType() const
std::optional<ParallelConfig> getParallelConfig() const
std::optional<PeftCacheConfig> getPeftCacheConfig() const
std::optional<LogitsPostProcessorMap> getLogitsPostProcessorMap() const
std::optional<MedusaChoices> getMedusaChoices() const
std::optional<DecodingMode> getDecodingMode() const
float getGpuWeightsPercent() const
void setMaxBeamWidth(SizeType32 maxBeamWidth)
void setSchedulerConfig(SchedulerConfig const &schedulerConfig)
void setKvCacheConfig(KvCacheConfig const &kvCacheConfig)
void setEnableChunkedContext(bool enableChunkedContext)
void setNormalizeLogProbs(bool normalizeLogProbs)
void setIterStatsMaxIterations(SizeType32 iterStatsMaxIterations)
void setRequestStatsMaxIterations(SizeType32 requestStatsMaxIterations)
void setBatchingType(BatchingType batchingType)
void setParallelConfig(ParallelConfig const &parallelConfig)
void setPeftCacheConfig(PeftCacheConfig const &peftCacheConfig)
void setLogitsPostProcessorMap(LogitsPostProcessorMap const &logitsPostProcessorMap)
void setMedusaChoices(MedusaChoices const &medusaChoices)
void setDecodingMode(DecodingMode decodingMode)
void setGpuWeightsPercent(float const &gpuWeightsPercent)
Private Members
SizeType32 mMaxBeamWidth
The beam width value of requests that will be sent to the executor.
SchedulerConfig mSchedulerConfig
The scheduler configuration.
KvCacheConfig mKvCacheConfig
The KV cache configuration.
bool mEnableChunkedContext
Controls whether chunked context is enabled.
bool mNormalizeLogProbs
Controls if log probabilities should be normalized or not.
SizeType32 mIterStatsMaxIterations
Controls the maximum number of iterations for which to keep statistics.
SizeType32 mRequestStatsMaxIterations
Controls the maximum number of iterations for which to keep per-request statistics.
BatchingType mBatchingType
The type of batching strategy to use. See BatchingType.
std::optional<ParallelConfig> mParallelConfig
The parallel execution configuration.
std::optional<PeftCacheConfig> mPeftCacheConfig
std::optional<LogitsPostProcessorMap> mLogitsPostProcessorMap
std::optional<MedusaChoices> mMedusaChoices
std::optional<DecodingMode> mDecodingMode
float mGpuWeightsPercent
- friend class Serialization
explicit ExecutorConfig(SizeType32 maxBeamWidth = 1, SchedulerConfig const &schedulerConfig = SchedulerConfig(), KvCacheConfig const &kvCacheConfig = KvCacheConfig(), bool enableChunkedContext = false, bool normalizeLogProbs = true, SizeType32 iterStatsMaxIterations = kDefaultIterStatsMaxIterations, SizeType32 requestStatsMaxIterations = kDefaultRequestStatsMaxIterations, BatchingType batchingType = BatchingType::kINFLIGHT, std::optional<ParallelConfig> parallelConfig = std::nullopt, std::optional<PeftCacheConfig> const &peftCacheConfig = std::nullopt, std::optional<LogitsPostProcessorMap> logitsPostProcessorMap = std::nullopt, std::optional<MedusaChoices> medusaChoices = std::nullopt, std::optional<DecodingMode> decodingMode = std::nullopt, float gpuWeightsPercent = 1)
class Executor
- #include <executor.h>
The executor is responsible for receiving new requests and sending responses, and running the inference.
Public Functions
Executor(std::filesystem::path const &modelPath, ModelType modelType, ExecutorConfig const &executorConfig)
- Parameters:
modelPath – Path to the folder that defines the model to run
modelType – The type of model
executorConfig – The configuration for the executor
comm – An optional inter-process communicator configuration
Executor(std::vector<uint8_t> const &engineBuffer, std::string const &jsonConfigStr, ModelType modelType, ExecutorConfig const &executorConfig)
IdType enqueueRequest(Request const &request)
Enqueue a new request.
- Parameters:
request – The LLM request which contains input tokens and request parameters
- Returns:
A unique id that identifies the request
std::vector<IdType> enqueueRequests(std::vector<Request> const &requests)
Enqueue a batch of requests.
std::vector<Response> awaitResponses(std::optional<std::chrono::milliseconds> const &timeout = std::nullopt)
Await for ready responses.
This overload awaits for any ready responses. In particular, if several requests have been enqueued, this method will provide any ready responses without order guarantees.
- Parameters:
timeout – The maximum time to wait for new responses
- Returns:
A vector of responses
std::vector<Response> awaitResponses(IdType const &requestId, std::optional<std::chrono::milliseconds> const &timeout = std::nullopt)
Await for ready responses.
- Parameters:
requestId – A request id
timeout – The maximum time to wait for new responses
- Returns:
A vector of responses
std::vector<std::vector<Response>> awaitResponses(std::vector<IdType> const &requestIds, std::optional<std::chrono::milliseconds> const &timeout = std::nullopt)
Await for multiple ready responses.
A multiple ID request behaves as if awaitResponses(IdType, timeout) were invoked on all IDs. The returned vector contains a vector of responses per ID in the same order specified by the requestIds. The same behaviour as awaitResponses(IdType, timeout) applies: * Responses may be empty. * If all responses have already been given for one of the requestIds, then this method will hang unless a timeout is specified.
- Parameters:
requestIds – Ids requested
timeout – The maximum time to wait for new responses
- Returns:
A vector of vector of responses
SizeType32 getNumResponsesReady(std::optional<IdType> const &requestId = std::nullopt) const
Get the number of ready responses.
- Parameters:
requestId – An optional request id
- Returns:
The number of ready responses
void cancelRequest(IdType requestId)
Cancel the request with provided request id.
- Parameters:
requestId – The request id for which to cancel the response
void shutdown()
Signals the server to shut down. This call is blocking. Only returns when all requests have terminated or timeout has been reached.
std::deque<IterationStats> getLatestIterationStats()
Returns the per-iteration statistics computed since the last call to getLatestIterationStats. Contains at most iterStatsMaxIterations iterations.
- Returns:
Iteration stats
std::deque<RequestStatsPerIteration> getLatestRequestStats()
Returns the request stats of each iteration computed since the last call to getLatestRequestStats. Contains at most requestStatsMaxIterations iterations.
- Returns:
Request stats grouped by iterations
bool canEnqueueRequests() const
Indicates if the current process is allowed to enqueueRequests.
Private Members
std::unique_ptr<Impl> mImpl
Executor(std::filesystem::path const &modelPath, ModelType modelType, ExecutorConfig const &executorConfig)
class JsonSerialization
- #include <executor.h>
Class with utility functions to serialize statistics to json string.
Public Static Functions
static std::string toJsonStr(IterationStats const &iterationStats)
Utility function to convert an iterationStats struct to a json serialized string.
static std::string toJsonStr(RequestStatsPerIteration const &requestStatsPerIter)
Utility function to convert a requestStatsPerIteration struct to a json serialized string.
static std::string toJsonStr(RequestStats const &requestStats)
Utility function to convert a requestStats struct to a json serialized string.
static std::string toJsonStr(IterationStats const &iterationStats)
SizeType32 const kDefaultIterStatsMaxIterations = 1000
namespace mpi
namespace executor
namespace tensorrt_llm
namespace executor
class Shape : public tensorrt_llm::common::ArrayView<detail::DimType64 const>
Public Types
using Base = tensorrt_llm::common::ArrayView<detail::DimType64 const>
using Base = tensorrt_llm::common::ArrayView<detail::DimType64 const>
class Tensor
Public Types
using CudaStreamPtr = std::shared_ptr<runtime::CudaStream>
Public Functions
Tensor copyToCpu(Tensor::CudaStreamPtr stream = nullptr) const
Tensor copyToPinned(Tensor::CudaStreamPtr stream = nullptr) const
Tensor copyToPooledPinned(Tensor::CudaStreamPtr stream = nullptr) const
Tensor copyToManaged(Tensor::CudaStreamPtr stream = nullptr) const
Tensor copyToGpu(Tensor::CudaStreamPtr stream) const
Tensor() noexcept = default
~Tensor() = default
void *getData()
Returns a pointer to underlying array.
void const *getData() const
Returns a pointer to underlying array.
MemoryType getMemoryType() const
Returns the memory type of the buffer.
std::size_t getSize() const
Returns the number of elements in the tensor.
std::size_t getSizeInBytes() const
Returns the size of the tensor in bytes.
void setZero(CudaStreamPtr stream = nullptr)
Set the entire memory to zero.
- Parameters:
stream – Must be a valid CUDA stream if the memory type is GPU.
void setFrom(Tensor const &other, CudaStreamPtr stream = nullptr)
Copy the data and shape from another tensor.
- Parameters:
other – A tensor to copy from.
stream – Must be a valid CUDA stream if the memory type is GPU.
inline explicit operator bool() const
Public Static Functions
static Tensor cpu(DataType dataType, Shape shape = {})
Allocate a cpu tensor with the given shape and data type.
- Parameters:
shape – The shape of the tensor.
dataType – The data type of the tensor.
static Tensor pinned(DataType dataType, Shape shape = {})
Allocate a cpu tensor in pinned memory with the given shape and data type.
- Parameters:
shape – The shape of the tensor.
dataType – The data type of the tensor.
static Tensor pooledPinned(DataType dataType, Shape shape = {})
Allocate a cpu tensor in pooled pinned memory with the given shape and data type.
- Parameters:
shape – The shape of the tensor.
dataType – The data type of the tensor.
static Tensor managed(DataType dataType, Shape shape = {})
Allocate a tensor in managed memory (UVM) with the given shape and data type.
- Parameters:
shape – The shape of the tensor.
dataType – The data type of the tensor.
static Tensor gpu(DataType dataType, CudaStreamPtr stream, Shape shape = {})
Allocate a gpu tensor with the given shape and data type on a particular cuda stream.
- Parameters:
shape – The shape of the tensor.
stream – Specifies the CUDA stream on which to allocate the tensor for GPU memory.
dataType – The data type of the tensor.
template<typename T>
static inline Tensor gpu(CudaStreamPtr stream, Shape shape = {})
static Tensor of(DataType dataType, void *data, Shape shape)
Wrap a data pointer into a tensor without taking ownership.
- Parameters:
shape – The shape of the tensor.
dataType – The data type of the tensor.
stream – Specifies the CUDA stream on which to allocate the tensor for GPU memory.
Private Functions
using CudaStreamPtr = std::shared_ptr<runtime::CudaStream>
class Shape : public tensorrt_llm::common::ArrayView<detail::DimType64 const>
namespace runtime
namespace executor
struct TypeTraits<std::int8_t>
struct TypeTraits<std::int32_t>
struct TypeTraits<std::int64_t>
struct TypeTraits<std::uint8_t>
namespace tensorrt_llm
namespace executor
using SizeType32 = std::int32_t
using FloatType = float
using TokenIdType = std::int32_t
using VecTokens = std::vector<TokenIdType>
using IdType = std::uint64_t
using IterationType = std::uint64_t
using RandomSeedType = std::uint64_t
using StreamPtr = std::shared_ptr<tensorrt_llm::runtime::CudaStream>
using LogitsPostProcessor = std::function<void(IdType, Tensor&, BeamTokens const&, StreamPtr&)>
using LogitsPostProcessorMap = std::unordered_map<std::string, LogitsPostProcessor>
using MedusaChoices = std::vector<std::vector<SizeType32>>
enum class DataType
enumerator kBOOL
enumerator kUINT8
enumerator kINT8
enumerator kINT32
enumerator kINT64
enumerator kBF16
enumerator kFP8
enumerator kFP16
enumerator kFP32
enumerator kUNKNOWN
enumerator kBOOL
enum class MemoryType
enumerator kCPU
enumerator kCPU_PINNED
enumerator kGPU
enumerator kUVM
enumerator kUNKNOWN
enumerator kCPU
enum class BatchingType
The batching type.
enumerator kSTATIC
STATIC refers to the traditional batching scheme with a batch of requests running in lockstep until the full generation for all of them is complete. Requests in a batch are all padded up to the maximum input and output sequence length of any member of the batch.
enumerator kINFLIGHT
INFLIGHT refers to a scheme where newly arrived requests are dynamically incorporated into the batch under execution, and requests are returned as soon as the end condition is met without any padding.
enumerator kSTATIC
enum class CapacitySchedulerPolicy
The policy used to select the subset of available requests in each iteration of the executor generation loop.
MAX_UTILIZATION packs as many requests as the underlying TRT engine can support in any iteration of the InflightBatching generation loop. While this is expected to maximize GPU throughput, it might require that some requests be paused and restarted depending on peak KV cache memory availability.
GUARANTEED_NO_EVICT uses KV cache more conservatively guaranteeing that a request, once started, will run to completion without eviction.
enum class ContextChunkingPolicy
Sequential chunking, complete the unfinished context phase first.
enumerator kEQUAL_PROGRESS
Iterate through each context request in sequence and attempt to increase its chunk count until the constraint is exceeded.
enum class RequestStage
Enum class that represents the state of a request.
enumerator kQUEUED
Requests that have been received but not yet included in the active requests (due to constraints such as maximum batch size for example).
Active request in context phase.
Active request in generation phase.
Active request for which generation has completed.
enumerator kQUEUED
enum class DecodingMode
Decoding mode.
enumerator kNONE
No mode specified. The config will be determined at runtime from the beam width of the first request: TopKTopP if beamWidth == 1, BeamSearch otherwise.
enumerator kTOP_K
enumerator kTOP_P
enumerator kBEAM_SEARCH
enumerator kMEDUSA
enumerator kTOP_K_TOP_P
enumerator kNONE
std::ostream &operator<<(std::ostream &os, CapacitySchedulerPolicy policy)
std::ostream &operator<<(std::ostream &os, ContextChunkingPolicy policy)
template<typename T, bool = false>
struct TypeTraits - #include <types.h>
For converting a C++ data type to a
struct TypeTraits<float>
struct TypeTraits<half>
- template<> struct TypeTraits<std::int8_t>
- template<> struct TypeTraits<std::int32_t>
- template<> struct TypeTraits<std::int64_t>
struct TypeTraits<bool>
- template<> struct TypeTraits<std::uint8_t>
struct KvCacheStats
- #include <types.h>
Struct that holds the stats of a KV cache manager.
Public Members
SizeType32 maxNumBlocks
Max number of blocks.
SizeType32 freeNumBlocks
Number of free blocks.
SizeType32 usedNumBlocks
Number of used blocks.
SizeType32 tokensPerBlock
Number of tokens per block.
SizeType32 maxNumBlocks
struct StaticBatchingStats
- #include <types.h>
Struct that holds the stats of static batching models for a single iteration.
Public Members
SizeType32 numScheduledRequests
Number of scheduled requests.
SizeType32 numContextRequests
Number of requests in context stage.
SizeType32 numCtxTokens
Total number of context tokens in the iteration.
SizeType32 numGenTokens
Total number of tokens to generate in the iteration.
SizeType32 emptyGenSlots
Total number of unused generation token slots.
SizeType32 numScheduledRequests
struct InflightBatchingStats
- #include <types.h>
Struct that holds the stats of inflight batching models for a single iteration.
Public Members
SizeType32 numScheduledRequests
Number of scheduled requests.
SizeType32 numContextRequests
Number of requests in context stage.
SizeType32 numGenRequests
Number of requests in generation stage.
SizeType32 numPausedRequests
Number of paused requests.
SizeType32 numCtxTokens
Total number of context tokens in the iteration.
SizeType32 microBatchId
Index of micro batch.
SizeType32 numScheduledRequests
struct IterationStats
- #include <types.h>
Struct that holds the stats of a single iteration.
Public Members
std::string timestamp
Ending time of this iteration.
IterationType iter
Iteration id.
SizeType32 numActiveRequests
Number of active requests.
SizeType32 maxNumActiveRequests
Maximum number of active requests.
size_t gpuMemUsage
GPU memory usage in bytes.
size_t cpuMemUsage
CPU memory usage in bytes.
size_t pinnedMemUsage
Pinned memory usage in bytes.
std::optional<KvCacheStats> kvCacheStats
Stats specific to KV caches.
std::optional<StaticBatchingStats> staticBatchingStats
Stats specific to static batching.
std::optional<InflightBatchingStats> inflightBatchingStats
Stats specific to inflight batching.
std::string timestamp
struct RequestStats
- #include <types.h>
Struct that holds the stats of a single request.
Public Members
RequestStage stage
The current stage the request is in.
SizeType32 contextPrefillPosition
If using chunked context, the current context prefill position.
SizeType32 numGeneratedTokens
The number of generated tokens so far.
bool scheduled
Whether the request is scheduled for the current iteration.
bool paused
Whether the request is paused at the current iteration due to a lack of resources (for example, KV cache block exhaustion).
RequestStage stage
struct RequestStatsPerIteration
- #include <types.h>
Struct that holds the stats of all requests in an iteration.
Public Members
IterationType iter
The iteration id for these stats.
std::vector<RequestStats> requestStats
The stats of all active requests for this iteration.
IterationType iter
using SizeType32 = std::int32_t
namespace runtime
namespace executor
namespace tensorrt_llm
namespace executor
class Serialization
Public Static Functions
static SamplingConfig deserializeSamplingConfig(std::istream &is)
static void serialize(SamplingConfig const &config, std::ostream &os)
static size_t serializedSize(SamplingConfig const &config)
static OutputConfig deserializeOutputConfig(std::istream &is)
static void serialize(OutputConfig const &config, std::ostream &os)
static size_t serializedSize(OutputConfig const &config)
static SpeculativeDecodingConfig deserializeSpeculativeDecodingConfig(std::istream &is)
static void serialize(SpeculativeDecodingConfig const &config, std::ostream &os)
static size_t serializedSize(SpeculativeDecodingConfig const &config)
static PromptTuningConfig deserializePromptTuningConfig(std::istream &is)
static void serialize(PromptTuningConfig const &config, std::ostream &os)
static size_t serializedSize(PromptTuningConfig const &config)
static LoraConfig deserializeLoraConfig(std::istream &is)
static void serialize(LoraConfig const &config, std::ostream &os)
static size_t serializedSize(LoraConfig const &config)
static KvCacheConfig deserializeKvCacheConfig(std::istream &is)
static void serialize(KvCacheConfig const &kvCacheConfig, std::ostream &os)
static size_t serializedSize(KvCacheConfig const &kvCacheConfig)
static SchedulerConfig deserializeSchedulerConfig(std::istream &is)
static void serialize(SchedulerConfig const &schedulerConfig, std::ostream &os)
static size_t serializedSize(SchedulerConfig const &schedulerConfig)
static ParallelConfig deserializeParallelConfig(std::istream &is)
static void serialize(ParallelConfig const &parallelConfig, std::ostream &os)
static size_t serializedSize(ParallelConfig const &parallelConfig)
static PeftCacheConfig deserializePeftCacheConfig(std::istream &is)
static void serialize(PeftCacheConfig const &peftCacheConfig, std::ostream &os)
static size_t serializedSize(PeftCacheConfig const &peftCacheConfig)
static OrchestratorConfig deserializeOrchestratorConfig(std::istream &is)
static void serialize(OrchestratorConfig const &orchestratorConfig, std::ostream &os)
static size_t serializedSize(OrchestratorConfig const &orchestratorConfig)
static ExecutorConfig deserializeExecutorConfig(std::istream &is)
static void serialize(ExecutorConfig const &executorConfig, std::ostream &os)
static size_t serializedSize(ExecutorConfig const &executorConfig)
static std::string deserializeString(std::istream &is)
static bool deserializeBool(std::istream &is)
static SamplingConfig deserializeSamplingConfig(std::istream &is)
class Serialization
namespace executor