Spaces:
Running
Running
File size: 6,461 Bytes
9b1aef8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 | /**
* Voice data loader — loads KittenTTS voice embeddings.
*
* Instead of parsing .npz (zip of npy), we download the npz and use
* a robust zip + npy parser with proper byte alignment handling.
*/
export interface VoiceInfo {
data: Float32Array;
shape: [number, number]; // [numStyles, styleDim]
}
function parseNpyHeader(bytes: Uint8Array) {
// Magic: \x93NUMPY
if (bytes[0] !== 0x93 || String.fromCharCode(bytes[1], bytes[2], bytes[3], bytes[4], bytes[5]) !== "NUMPY") {
throw new Error("Not a valid .npy file");
}
const majorVersion = bytes[6];
const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
let headerLen: number;
let headerOffset: number;
if (majorVersion === 1) {
headerLen = view.getUint16(8, true);
headerOffset = 10;
} else {
headerLen = view.getUint32(8, true);
headerOffset = 12;
}
const headerStr = new TextDecoder().decode(
bytes.slice(headerOffset, headerOffset + headerLen)
);
const descrMatch = headerStr.match(/'descr'\s*:\s*'([^']+)'/);
const shapeMatch = headerStr.match(/'shape'\s*:\s*\(([^)]*)\)/);
if (!descrMatch) throw new Error("Could not parse dtype from .npy header: " + headerStr);
const descr = descrMatch[1];
const shapeNums = shapeMatch
? shapeMatch[1].split(",").map((s) => parseInt(s.trim(), 10)).filter((n) => !isNaN(n))
: [];
const dataOffset = headerOffset + headerLen;
return { descr, shape: shapeNums, dataOffset };
}
function npyToFloat32(bytes: Uint8Array): { data: Float32Array; shape: number[] } {
const { descr, shape, dataOffset } = parseNpyHeader(bytes);
const rawBytes = bytes.slice(dataOffset);
// Always copy into a fresh aligned ArrayBuffer
const aligned = new ArrayBuffer(rawBytes.length);
new Uint8Array(aligned).set(rawBytes);
let data: Float32Array;
if (descr === "<f4" || descr === "float32") {
data = new Float32Array(aligned);
} else if (descr === "<f8" || descr === "float64") {
const f64 = new Float64Array(aligned);
data = new Float32Array(f64.length);
for (let i = 0; i < f64.length; i++) data[i] = f64[i];
} else {
throw new Error("Unsupported npy dtype: " + descr);
}
return { data, shape };
}
/**
* Parse a zip file and extract entries.
* Handles both stored (method 0) and deflated (method 8) entries.
* Properly handles data descriptors (bit 3 of flags).
*/
async function extractZipEntries(
buffer: ArrayBuffer
): Promise<Map<string, Uint8Array>> {
const bytes = new Uint8Array(buffer);
const view = new DataView(buffer);
const entries = new Map<string, Uint8Array>();
// First, find the Central Directory to get reliable sizes
// Search for End of Central Directory signature (0x06054b50) from the end
let eocdOffset = -1;
for (let i = bytes.length - 22; i >= 0; i--) {
if (view.getUint32(i, true) === 0x06054b50) {
eocdOffset = i;
break;
}
}
if (eocdOffset === -1) {
throw new Error("Could not find End of Central Directory");
}
const cdOffset = view.getUint32(eocdOffset + 16, true);
const cdEntries = view.getUint16(eocdOffset + 10, true);
// Parse Central Directory entries to get accurate sizes and offsets
interface CDEntry {
fileName: string;
compressedSize: number;
uncompressedSize: number;
localHeaderOffset: number;
compressionMethod: number;
}
const cdList: CDEntry[] = [];
let cdPos = cdOffset;
for (let i = 0; i < cdEntries; i++) {
const sig = view.getUint32(cdPos, true);
if (sig !== 0x02014b50) break;
const compressionMethod = view.getUint16(cdPos + 10, true);
const compressedSize = view.getUint32(cdPos + 20, true);
const uncompressedSize = view.getUint32(cdPos + 24, true);
const fileNameLen = view.getUint16(cdPos + 28, true);
const extraLen = view.getUint16(cdPos + 30, true);
const commentLen = view.getUint16(cdPos + 32, true);
const localHeaderOffset = view.getUint32(cdPos + 42, true);
const fileName = new TextDecoder().decode(
bytes.slice(cdPos + 46, cdPos + 46 + fileNameLen)
);
cdList.push({
fileName,
compressedSize,
uncompressedSize,
localHeaderOffset,
compressionMethod,
});
cdPos += 46 + fileNameLen + extraLen + commentLen;
}
// Now extract each entry using local headers + CD sizes
for (const cd of cdList) {
const lhOffset = cd.localHeaderOffset;
const lhFileNameLen = view.getUint16(lhOffset + 26, true);
const lhExtraLen = view.getUint16(lhOffset + 28, true);
const dataStart = lhOffset + 30 + lhFileNameLen + lhExtraLen;
let fileData: Uint8Array;
if (cd.compressionMethod === 0) {
// Stored
fileData = bytes.slice(dataStart, dataStart + cd.uncompressedSize);
} else if (cd.compressionMethod === 8) {
// Deflate
const compressed = bytes.slice(dataStart, dataStart + cd.compressedSize);
const ds = new DecompressionStream("deflate-raw");
const writer = ds.writable.getWriter();
writer.write(compressed);
writer.close();
const reader = ds.readable.getReader();
const chunks: Uint8Array[] = [];
let totalLen = 0;
while (true) {
const { done, value } = await reader.read();
if (done) break;
chunks.push(value);
totalLen += value.length;
}
fileData = new Uint8Array(totalLen);
let pos = 0;
for (const chunk of chunks) {
fileData.set(chunk, pos);
pos += chunk.length;
}
} else {
console.warn(`Skipping ${cd.fileName}: unsupported compression ${cd.compressionMethod}`);
continue;
}
entries.set(cd.fileName, fileData);
}
return entries;
}
/**
* Load voice embeddings from a .npz file URL.
*/
export async function loadVoices(
url: string
): Promise<Record<string, VoiceInfo>> {
const response = await fetch(url);
if (!response.ok) throw new Error(`Failed to fetch voices: ${response.status}`);
const arrayBuffer = await response.arrayBuffer();
const entries = await extractZipEntries(arrayBuffer);
const voices: Record<string, VoiceInfo> = {};
for (const [fileName, fileData] of entries) {
if (!fileName.endsWith(".npy")) continue;
const voiceName = fileName.replace(/\.npy$/, "");
const { data, shape } = npyToFloat32(fileData);
voices[voiceName] = {
data,
shape: [shape[0] || 1, shape[1] || data.length],
};
}
return voices;
}
|