token-chunk Split text into token-sized chunks for RAG/embedding
Flags
--size number default: 512
--overlap number default: 0
--encoding select: cl100k_base | o200k_base default: cl100k_base
Examples
Chunk a paragraph into embedding-sized pieces
Usage
"Machine learning is a subset of artificial intelligence. It ..." | token-chunk

Split a document into overlapping chunks for RAG
Usage
"Chapter 1: Introduction
The story begins in a small town. Th..." | token-chunk

Chunk SQL queries into batches for LLM analysis
Usage
"SELECT id, name FROM users WHERE active = true;
SELECT * FRO..." | token-chunk

View source
// Splits `input` into token-count-bounded chunks and joins them with a
// "---" separator line, for RAG / embedding pipelines.
//
// opts.size     — max tokens per chunk (default 512, clamped to >= 1)
// opts.overlap  — tokens shared between consecutive chunks
//                 (default 0, clamped to [0, size - 1] so the stride is >= 1)
// opts.encoding — tokenizer encoding name (default "cl100k_base")
async (input, opts = {}) => {
  // Whitespace-only input has nothing to chunk.
  if (!input.trim()) return "(empty input)";

  // Lazy-load the tokenizer module and wait for its top-level-await init.
  const mod = await import('./tokenizer_qMtbZfTQ.mjs');
  await mod.__tla;
  const { getEncoder } = mod;

  const encodingName = opts.encoding || "cl100k_base";
  const chunkSize = Math.max(1, parseInt(opts.size, 10) || 512);
  const requestedOverlap = parseInt(opts.overlap, 10) || 0;
  const overlapTokens = Math.max(0, Math.min(requestedOverlap, chunkSize - 1));

  const encoder = await getEncoder(encodingName);
  const tokenIds = encoder.encode(input);
  // fatal: false — a chunk boundary may split a multi-byte UTF-8 sequence;
  // replace malformed bytes rather than throw.
  const textDecoder = new TextDecoder("utf-8", { fatal: false });

  // Each window starts `stride` tokens after the previous one, so adjacent
  // windows share `overlapTokens` tokens.
  const stride = chunkSize - overlapTokens;
  const pieces = [];
  for (let start = 0; start < tokenIds.length; start += stride) {
    const windowIds = tokenIds.slice(start, start + chunkSize);
    pieces.push(textDecoder.decode(encoder.decode(windowIds)));
  }
  return pieces.join("\n---\n");
}