Polyphase Game Engine
Loading...
Searching...
No Matches
TinyLLMTokenizerAsset.h
Go to the documentation of this file.
1
6#pragma once
7
8#include "Asset.h"
9#include <vector>
10#include <string>
11
17{
18public:
19
21
23 virtual ~TinyLLMTokenizerAsset();
24
25 // Asset interface: every function below overrides a virtual declared on the Asset base class.
26 virtual void Create() override;
27 virtual void Destroy() override;
28 virtual void LoadStream(Stream& stream, Platform platform) override;
29 virtual void SaveStream(Stream& stream, Platform platform) override;
30 virtual bool Import(const std::string& path, ImportOptions* options) override; // NOTE(review): the base declaration defaults options to nullptr; this override declares no default, so calls made through this type must pass options explicitly.
31 virtual void GatherProperties(std::vector<Property>& outProps) override;
32 virtual glm::vec4 GetTypeColor() override;
33 virtual const char* GetTypeName() override;
34 virtual const char* GetTypeImportExt() override; // presumably the source-file extension handled by Import() — TODO confirm in the .cpp
35
36 // Tokenizer access: const accessors over the vocabulary data stored in this asset.
37 const std::vector<std::string>& GetVocab() const { return mVocab; } // returns a reference to the member; no copy is made
38 const std::vector<float>& GetVocabScores() const { return mVocabScores; } // returns a reference to the member; no copy is made
39 uint32_t GetMaxTokenLength() const { return mMaxTokenLength; }
40 int32_t GetVocabSize() const { return static_cast<int32_t>(mVocab.size()); } // size_t narrowed to the int32_t return type; named cast keeps the narrowing explicit and greppable
41
42 // Tokenization
43 struct TokenIndex { // Pairs a token string with an integer id; this is the element type of mSortedVocab.
44 const char* str; // raw, non-owning pointer by type; presumably points into a string held by mVocab — TODO confirm in the .cpp
45 int id; // presumably the token's index in mVocab — TODO confirm
46 };
47 int StrLookup(const char* str); // NOTE(review): presumably returns the id of the vocab entry equal to str; the not-found value is not visible in this header — confirm
48 void Encode(const char* text, bool addBos, bool addEos, std::vector<int32_t>& outTokens); // results are delivered through outTokens; addBos/addEos presumably toggle begin/end-of-sequence tokens — confirm
49 std::string Decode(int prevToken, int token); // returns the text by value; presumably the piece for token given the preceding token — confirm
50
51protected:
52
53 static bool HandlePropChange(Datum* datum, uint32_t index, const void* newValue); // static, so it carries no implicit this; presumably registered as a property-change callback — TODO confirm
54
55 void BuildSortedVocab(); // presumably populates mSortedVocab from mVocab and sets mSortedVocabBuilt — confirm in the .cpp
56 void InitBytePieces(); // presumably fills mBytePieces — confirm in the .cpp
57
58 // Tokenizer data (exposed read-only through GetVocab / GetVocabScores / GetMaxTokenLength)
59 std::vector<std::string> mVocab;
60 std::vector<float> mVocabScores; // presumably one score per mVocab entry (parallel arrays) — TODO confirm
61 uint32_t mMaxTokenLength = 0;
62
63 // Sorted vocab for fast lookup (built lazily)
64 std::vector<TokenIndex> mSortedVocab;
65 bool mSortedVocabBuilt = false; // starts false; presumably flipped once mSortedVocab has been built — confirm
66
67 // Byte-level fallback tokens
68 char mBytePieces[512]; // NOTE(review): no in-class initializer, unlike the members above; presumably filled by InitBytePieces() before use, and 512 presumably = 256 byte values x 2 chars each — confirm
69};
Platform
Definition EngineTypes.h:31
#define POLYPHASE_API
Definition PolyphaseAPI.h:31
Definition Asset.h:113
virtual bool Import(const std::string &path, ImportOptions *options=nullptr)
Definition Asset.cpp:244
virtual void Create()
Definition Asset.cpp:77
virtual void SaveStream(Stream &stream, Platform platform)
Definition Asset.cpp:236
virtual glm::vec4 GetTypeColor()
Definition Asset.cpp:254
virtual const char * GetTypeImportExt()
Definition Asset.cpp:264
virtual const char * GetTypeName()
Definition Asset.cpp:259
virtual void GatherProperties(std::vector< Property > &outProps) override
Definition Asset.cpp:249
virtual void LoadStream(Stream &stream, Platform platform)
Definition Asset.cpp:222
virtual void Destroy()
Definition Asset.cpp:87
Definition Datum.h:164
Definition Asset.h:102
Definition Stream.h:21
Asset containing tokenizer vocabulary for TinyLLM models.
Definition TinyLLMTokenizerAsset.h:17
std::vector< std::string > mVocab
Definition TinyLLMTokenizerAsset.h:59
uint32_t GetMaxTokenLength() const
Definition TinyLLMTokenizerAsset.h:39
std::vector< float > mVocabScores
Definition TinyLLMTokenizerAsset.h:60
const std::vector< float > & GetVocabScores() const
Definition TinyLLMTokenizerAsset.h:38
DECLARE_ASSET(TinyLLMTokenizerAsset, Asset)
const std::vector< std::string > & GetVocab() const
Definition TinyLLMTokenizerAsset.h:37
int32_t GetVocabSize() const
Definition TinyLLMTokenizerAsset.h:40
std::vector< TokenIndex > mSortedVocab
Definition TinyLLMTokenizerAsset.h:64
Definition TinyLLMTokenizerAsset.h:43
int id
Definition TinyLLMTokenizerAsset.h:45
const char * str
Definition TinyLLMTokenizerAsset.h:44