Polyphase Game Engine
Loading...
Searching...
No Matches
TinyLLMManager.h
Go to the documentation of this file.
1
6#pragma once
7
8#include "PolyphaseAPI.h"
10#include <string>
11#include <vector>
12#include <stdint.h>
13
19{
20public:
21 static TinyLLMManager* Get();
22 static void Create();
23 static void Destroy();
24
25 // Model lifecycle
26 bool LoadModel(TinyLLMAsset* asset, int32_t maxSeqLen = 0);
27 void UnloadModel();
28 bool IsModelLoaded() const;
29 TinyLLMAsset* GetModel() const;
30
31 // Low-level inference
32 void Reset();
33 float* Forward(int32_t token, int32_t pos);
34 int32_t Sample(float temperature = 1.0f, float topP = 0.9f);
35
36 // High-level generation
37 std::string Generate(const std::string& prompt, int32_t maxTokens,
38 float temperature = 1.0f, float topP = 0.9f);
39
40 // Tokenization (uses engine's TinyLLMAsset directly - no DLL issues)
41 std::vector<int32_t> Encode(const std::string& text, bool addBos = true);
42 std::string Decode(int32_t prevToken, int32_t token);
43
44 // Streaming support
45 bool BeginGenerate(const std::string& prompt, int32_t maxTokens,
46 float temperature = 1.0f, float topP = 0.9f);
47 std::string ContinueGenerate();
48 bool IsGenerating() const;
49 void Abort();
50
51 // Stats
52 float GetLastTokPerSec() const;
53 int32_t GetPosition() const;
54 int32_t GetMaxSeqLen() const;
55
56 // Platform-specific defaults
57 static int32_t GetDefaultMaxSeqLen();
58
59 // Sampler entry type: element of the mProbIndex buffer (public for comparison function)
60 struct ProbIndex {
61 float prob; // presumably the probability of one candidate token - TODO confirm
62 int index; // presumably that candidate's index into the logits/vocabulary - TODO confirm
63 };
64
65private:
68
69 void AllocateRunState(int32_t maxSeqLen);
70 void FreeRunState();
71 void SetupRunStatePointers();
72 int32_t SampleInternal(float* logits, float temperature, float topP);
73
74 static TinyLLMManager* sInstance;
75
76 TinyLLMAsset* mModel = nullptr;
77
78 // RunState buffers
79 float* mRunStateBuffer = nullptr;
80 size_t mRunStateSize = 0;
81
82 // Pointers into RunState
83 float* mX = nullptr;
84 float* mXb = nullptr;
85 float* mXb2 = nullptr;
86 float* mHb = nullptr;
87 float* mHb2 = nullptr;
88 float* mQ = nullptr;
89 float* mKeyCache = nullptr;
90 float* mValueCache = nullptr;
91 float* mAtt = nullptr;
92 float* mLogits = nullptr;
93
94 std::vector<ProbIndex> mProbIndex;
95 uint64_t mRngState = 0;
96
97 // Sequence state
98 int32_t mPos = 0;
99 int32_t mMaxSeqLen = 0;
100
101 // Streaming generation state
102 bool mIsGenerating = false;
103 std::vector<int32_t> mPromptTokens;
104 int32_t mPromptIdx = 0;
105 int32_t mLastToken = 0;
106 int32_t mGeneratedCount = 0;
107 int32_t mMaxGenTokens = 0;
108 float mTemperature = 1.0f;
109 float mTopP = 0.9f;
110
111 // Stats
112 float mLastTokPerSec = 0.0f;
113 int64_t mGenStartTime = 0;
114};
Export macros for Polyphase Engine symbols.
#define POLYPHASE_API
Definition PolyphaseAPI.h:31
Asset type for ultra-tiny LLM models (llama2.c format).
Asset containing a tiny LLM model and tokenizer.
Definition TinyLLMAsset.h:43
Singleton that manages LLM inference state.
Definition TinyLLMManager.h:19
Definition TinyLLMManager.h:60
int index
Definition TinyLLMManager.h:62
float prob
Definition TinyLLMManager.h:61