diff options
author | dan <[email protected]> | 2023-06-07 21:19:21 -0400 |
---|---|---|
committer | dan <[email protected]> | 2023-06-07 21:19:21 -0400 |
commit | 5a5e2dd05623aad71d8343a5de0480cd8756f85b (patch) | |
tree | 7357cde29f61eb43fb8fbee49b4c452f953742e0 /semanticsearchscratchpad/create-embeddings.js | |
parent | ace13cf4aa724c078a6a6f36a6f243cd4821a548 (diff) | |
download | dump-5a5e2dd05623aad71d8343a5de0480cd8756f85b.tar.gz dump-5a5e2dd05623aad71d8343a5de0480cd8756f85b.tar.bz2 dump-5a5e2dd05623aad71d8343a5de0480cd8756f85b.zip |
proto: working sem search with embedding done by workers
Diffstat (limited to 'semanticsearchscratchpad/create-embeddings.js')
-rw-r--r-- | semanticsearchscratchpad/create-embeddings.js | 84 |
1 files changed, 63 insertions, 21 deletions
diff --git a/semanticsearchscratchpad/create-embeddings.js b/semanticsearchscratchpad/create-embeddings.js index bbb3a81..2d2c321 100644 --- a/semanticsearchscratchpad/create-embeddings.js +++ b/semanticsearchscratchpad/create-embeddings.js @@ -1,30 +1,72 @@ -onmessage = msg => { - const lines = msg.data; - - const request = indexedDB.open("embeddings"); - - request.onupgradeneeded = (event) => { - console.log('onupgradeneeded') - const db = event.target.result; - db.createObjectStore('embeddings', {autoIncrement: true}); - }; - - let embeddings; - request.onsuccess = (event) => { - const db = event.target.result; - embeddings = db.transaction('embeddings', 'readwrite').objectStore('embeddings').getAll(); + +//const yemb = (await emb('I really like curry.')).data; +// +// +//console.log('out', xsembs.map(x => ({value:x.value, similarity: cosineSimilarity(x.embeddings, yemb)}))); +// +//import { pipeline, env } from 'https://cdn.jsdelivr.net/npm/@xenova/[email protected]'; + +//importScripts('./embeddings-lib.js'); + +import('https://cdn.jsdelivr.net/npm/@xenova/[email protected]') + .then(({pipeline, env}) => { + +env.allowLocalModels = false; + +//const extractor = await pipeline('embeddings', 'Xenova/all-MiniLM-L6-v2'); +//const emb = x => {postMessage(x) ; return extractor(x, {pooling:'mean', normalize:'true'})}; +return pipeline('embeddings', 'Xenova/all-MiniLM-L6-v2') + .then(extractor => { + +const emb = x => extractor(x, {pooling:'mean', normalize:'true'}); + +async function createEmbeddings(xs) { + return (await Promise.all(xs.map(emb))).map((x,i) => ({value: xs[i], embeddings: x.data})); +} + +function generateEmbeddings({data: {group, dataset}}) { +const request = indexedDB.open("embeddings"); +request.onerror = () => { + console.error("Why didn't you allow my web app to use IndexedDB?!"); +}; + +request.onupgradeneeded = (event) => { + const db = event.target.result; + const objectStore = db.createObjectStore('embeddings', {autoIncrement: true}); + objectStore.createIndex("value", "value", { unique: false }); + }; + +request.onsuccess = (event) => { + const db = event.target.result; + let t = db.transaction('embeddings', 'readwrite').objectStore('embeddings').getAll(); + t.onerror = () => console.error('transaction failed'); + t.onsuccess = e => { + const embeddings = e.target.result.filter(x => x.group === group); if (!embeddings || !embeddings.length) { - createEmbeddings(lines) + createEmbeddings(dataset) .then(xsembs => { - embeddings = xsembs; const st = db .transaction('embeddings', 'readwrite') .objectStore('embeddings'); xsembs.forEach(emb => { - console.log(emb); - st.add(emb); + st.add({...emb, group}); }); }); - } - }; + } +} +} + } + +self.onmessage = generateEmbeddings; +postMessage({loaded:true}); +return generateEmbeddings; + }); + }); + + +// const generateEmbeddings = self.onmessage; +// +//export default generateEmbeddings; + + |