From 5a5e2dd05623aad71d8343a5de0480cd8756f85b Mon Sep 17 00:00:00 2001 From: dan Date: Wed, 7 Jun 2023 21:19:21 -0400 Subject: proto: working sem search with embedding done by workers --- semanticsearchscratchpad/create-embeddings.js | 84 ++++++++++++++++++++------- 1 file changed, 63 insertions(+), 21 deletions(-) (limited to 'semanticsearchscratchpad/create-embeddings.js') diff --git a/semanticsearchscratchpad/create-embeddings.js b/semanticsearchscratchpad/create-embeddings.js index bbb3a81..2d2c321 100644 --- a/semanticsearchscratchpad/create-embeddings.js +++ b/semanticsearchscratchpad/create-embeddings.js @@ -1,30 +1,72 @@ -onmessage = msg => { - const lines = msg.data; - - const request = indexedDB.open("embeddings"); - - request.onupgradeneeded = (event) => { - console.log('onupgradeneeded') - const db = event.target.result; - db.createObjectStore('embeddings', {autoIncrement: true}); - }; - - let embeddings; - request.onsuccess = (event) => { - const db = event.target.result; - embeddings = db.transaction('embeddings', 'readwrite').objectStore('embeddings').getAll(); + +//const yemb = (await emb('I really like curry.')).data; +// +// +//console.log('out', xsembs.map(x => ({value:x.value, similarity: cosineSimilarity(x.embeddings, yemb)}))); +// +//import { pipeline, env } from 'https://cdn.jsdelivr.net/npm/@xenova/transformers@2.1.1'; + +//importScripts('./embeddings-lib.js'); + +import('https://cdn.jsdelivr.net/npm/@xenova/transformers@2.1.1') + .then(({pipeline, env}) => { + +env.allowLocalModels = false; + +//const extractor = await pipeline('embeddings', 'Xenova/all-MiniLM-L6-v2'); +//const emb = x => {postMessage(x) ; return extractor(x, {pooling:'mean', normalize:'true'})}; +return pipeline('embeddings', 'Xenova/all-MiniLM-L6-v2') + .then(extractor => { + +const emb = x => extractor(x, {pooling:'mean', normalize:'true'}); + +async function createEmbeddings(xs) { + return (await Promise.all(xs.map(emb))).map((x,i) => ({value: xs[i], embeddings: x.data})); +} + +function generateEmbeddings({data: {group, dataset}}) { +const request = indexedDB.open("embeddings"); +request.onerror = () => { + console.error("Why didn't you allow my web app to use IndexedDB?!"); +}; + +request.onupgradeneeded = (event) => { + const db = event.target.result; + const objectStore = db.createObjectStore('embeddings', {autoIncrement: true}); + objectStore.createIndex("value", "value", { unique: false }); + }; + +request.onsuccess = (event) => { + const db = event.target.result; + let t = db.transaction('embeddings', 'readwrite').objectStore('embeddings').getAll(); + t.onerror = () => console.error('transaction failed'); + t.onsuccess = e => { + const embeddings = e.target.result.filter(x => x.group === group); if (!embeddings || !embeddings.length) { - createEmbeddings(lines) + createEmbeddings(dataset) .then(xsembs => { - embeddings = xsembs; const st = db .transaction('embeddings', 'readwrite') .objectStore('embeddings'); xsembs.forEach(emb => { - console.log(emb); - st.add(emb); + st.add({...emb, group}); }); }); - } - }; + } +} +} + } + +self.onmessage = generateEmbeddings; +postMessage({loaded:true}); +return generateEmbeddings; + }); + }); + + +// const generateEmbeddings = self.onmessage; +// +//export default generateEmbeddings; + + -- cgit v1.2.3