diff options
author | dan <[email protected]> | 2023-06-07 21:19:21 -0400 |
---|---|---|
committer | dan <[email protected]> | 2023-06-07 21:19:21 -0400 |
commit | 5a5e2dd05623aad71d8343a5de0480cd8756f85b (patch) | |
tree | 7357cde29f61eb43fb8fbee49b4c452f953742e0 | |
parent | ace13cf4aa724c078a6a6f36a6f243cd4821a548 (diff) | |
download | dump-5a5e2dd05623aad71d8343a5de0480cd8756f85b.tar.gz dump-5a5e2dd05623aad71d8343a5de0480cd8756f85b.tar.bz2 dump-5a5e2dd05623aad71d8343a5de0480cd8756f85b.zip |
proto: working sem search with embedding done by workers
-rw-r--r-- | semanticsearchscratchpad/create-embeddings.js | 84 | ||||
-rw-r--r-- | semanticsearchscratchpad/index.html | 144 |
2 files changed, 153 insertions, 75 deletions
diff --git a/semanticsearchscratchpad/create-embeddings.js b/semanticsearchscratchpad/create-embeddings.js index bbb3a81..2d2c321 100644 --- a/semanticsearchscratchpad/create-embeddings.js +++ b/semanticsearchscratchpad/create-embeddings.js @@ -1,30 +1,72 @@ -onmessage = msg => { - const lines = msg.data; - - const request = indexedDB.open("embeddings"); - - request.onupgradeneeded = (event) => { - console.log('onupgradeneeded') - const db = event.target.result; - db.createObjectStore('embeddings', {autoIncrement: true}); - }; - - let embeddings; - request.onsuccess = (event) => { - const db = event.target.result; - embeddings = db.transaction('embeddings', 'readwrite').objectStore('embeddings').getAll(); + +//const yemb = (await emb('I really like curry.')).data; +// +// +//console.log('out', xsembs.map(x => ({value:x.value, similarity: cosineSimilarity(x.embeddings, yemb)}))); +// +//import { pipeline, env } from 'https://cdn.jsdelivr.net/npm/@xenova/[email protected]'; + +//importScripts('./embeddings-lib.js'); + +import('https://cdn.jsdelivr.net/npm/@xenova/[email protected]') + .then(({pipeline, env}) => { + +env.allowLocalModels = false; + +//const extractor = await pipeline('embeddings', 'Xenova/all-MiniLM-L6-v2'); +//const emb = x => {postMessage(x) ; return extractor(x, {pooling:'mean', normalize:'true'})}; +return pipeline('embeddings', 'Xenova/all-MiniLM-L6-v2') + .then(extractor => { + +const emb = x => extractor(x, {pooling:'mean', normalize:'true'}); + +async function createEmbeddings(xs) { + return (await Promise.all(xs.map(emb))).map((x,i) => ({value: xs[i], embeddings: x.data})); +} + +function generateEmbeddings({data: {group, dataset}}) { +const request = indexedDB.open("embeddings"); +request.onerror = () => { + console.error("Why didn't you allow my web app to use IndexedDB?!"); +}; + +request.onupgradeneeded = (event) => { + const db = event.target.result; + const objectStore = db.createObjectStore('embeddings', {autoIncrement: true}); + objectStore.createIndex("value", "value", { unique: false }); + }; + +request.onsuccess = (event) => { + const db = event.target.result; + let t = db.transaction('embeddings', 'readwrite').objectStore('embeddings').getAll(); + t.onerror = () => console.error('transaction failed'); + t.onsuccess = e => { + const embeddings = e.target.result.filter(x => x.group === group); if (!embeddings || !embeddings.length) { - createEmbeddings(lines) + createEmbeddings(dataset) .then(xsembs => { - embeddings = xsembs; const st = db .transaction('embeddings', 'readwrite') .objectStore('embeddings'); xsembs.forEach(emb => { - console.log(emb); - st.add(emb); + st.add({...emb, group}); }); }); - } - }; + } +} +} + } + +self.onmessage = generateEmbeddings; +postMessage({loaded:true}); +return generateEmbeddings; + }); + }); + + +// const generateEmbeddings = self.onmessage; +// +//export default generateEmbeddings; + + diff --git a/semanticsearchscratchpad/index.html b/semanticsearchscratchpad/index.html index 8a96b54..7657c94 100644 --- a/semanticsearchscratchpad/index.html +++ b/semanticsearchscratchpad/index.html @@ -10,20 +10,65 @@ </div> </form> - <p class="result">Result: 0</p> + <pre id="result"></pre> </div> + <!-- <script src="dataset.js"></script> --!> <script type="module"> +import {Stripe_1, +Stripe_2, +Stripe_3, +Gmail_1, +Gmail_2, +Gmail_3, +Alexa_1, +Alexa_2, +Alexa_3 +} from './dataset.js'; + import { pipeline, env } from 'https://cdn.jsdelivr.net/npm/@xenova/[email protected]'; env.allowLocalModels = false; +const extractor = await pipeline('embeddings', 'Xenova/all-MiniLM-L6-v2'); +const emb = x => extractor(x, {pooling:'mean', normalize:'true'}); + + +const s = {Stripe_1, +Stripe_2, +Stripe_3, +Gmail_1, +Gmail_2, +Gmail_3, +Alexa_1, +Alexa_2, +Alexa_3}; + + +//import genE from './create-embeddings.js'; + +//genE({data: {group:'Stripe_1', dataset: Stripe_1}}); + +Object.keys(s).map(k => { +const embWorker = new Worker("create-embeddings.js"); +//const k = 'Gmail_1'; +embWorker.onmessage = ({data}) => { + if (data.loaded) { + embWorker.postMessage({group:k, dataset: s[k]}); + console.log('Message posted to worker', {group:k, dataset: s[k]}); + } else { + console.error(data); + } +} +}); -let pipe = await pipeline('embeddings'); -let emb = x => pipe(x, {pooling:'mean', normalize:'true'}); -async function createEmbeddings(xs) { - return (await Promise.all(xs.map(emb))).map((x,i) => ({value: xs[i], embeddings: x.data})); -} +//let pipe = await pipeline('embeddings', 'Xenova/all-MiniLM-L6-v2'); +// +//let emb = x => pipe(x, {pooling:'mean', normalize:'true'}); +// +//async function createEmbeddings(xs) { +// return (await Promise.all(xs.map(emb))).map((x,i) => ({value: xs[i], embeddings: x.data})); +//} function dotp(x, y) { function dotp_sum(a, b) { @@ -42,36 +87,28 @@ function cosineSimilarity(A,B){ -/** Open DB **/ - - -/****/ - - - //const yemb = (await emb('I really like curry.')).data; // // //console.log('out', xsembs.map(x => ({value:x.value, similarity: cosineSimilarity(x.embeddings, yemb)}))); // -/******/ - -const request = indexedDB.open("embeddings"); -request.onerror = (event) => { - console.error("Why didn't you allow my web app to use IndexedDB?!"); -}; - -request.onupgradeneeded = (event) => { - console.log('onupgradeneeded') - const db = event.target.result; - db.createObjectStore('embeddings', {autoIncrement: true}); - }; - +//const request = indexedDB.open("embeddings"); +//request.onerror = (event) => { +// console.error("Why didn't you allow my web app to use IndexedDB?!"); +//}; +// +//request.onupgradeneeded = (event) => { +// console.log('onupgradeneeded') +// const db = event.target.result; +// const objectStore = db.createObjectStore('embeddings', {autoIncrement: true}); +// objectStore.createIndex("value", "value", { unique: false }); +// }; +// var embeddings; - number1.onchange = e => { + const query = e.target.value; const request = indexedDB.open("embeddings"); request.onsuccess = (event) => { const db = event.target.result; @@ -80,14 +117,14 @@ number1.onchange = e => { // embeddings.onsuccess = => embeddings = e.target.result; if (embeddings && embeddings.length) { - emb('I really like curry.').then(yemb =>{ + emb(query).then(yemb =>{ const r = embeddings .map(x => ({value:x.value, similarity: cosineSimilarity(x.embeddings, yemb.data)})); - console.log(r) - result.innerHTML = r - .sort((a,b) => a.similarity > b.similarity) - .map(({value, similarity}) => `<div>${value}: ${similarity}</div>`) - .join('<br/>'); + result.textContent = r + .sort((a,b) => b.similarity - a.similarity) + .slice(0, 10) + .map(({value, similarity}) => `${similarity}: ${value}`) + .join('\n'); } ); } else { @@ -97,26 +134,26 @@ number1.onchange = e => { } } -request.onsuccess = (event) => { - const db = event.target.result; - let t = db.transaction('embeddings', 'readwrite').objectStore('embeddings').getAll(); - t.onerror = () => console.error('transaction failed'); - t.onsuccess = e => { - const embeddings = e.target.result; - if (!embeddings || !embeddings.length) { - createEmbeddings(['Jim likes curry', 'I really dislike potatoes', 'We went to Mars last week']) - .then(xsembs => { - const st = db - .transaction('embeddings', 'readwrite') - .objectStore('embeddings'); - xsembs.forEach(emb => { - console.log(emb); - st.add(emb); - }); - }); - } -} -} +//request.onsuccess = (event) => { +// const db = event.target.result; +// let t = db.transaction('embeddings', 'readwrite').objectStore('embeddings').getAll(); +// t.onerror = () => console.error('transaction failed'); +// t.onsuccess = e => { +// const embeddings = e.target.result; +// if (!embeddings || !embeddings.length) { +// createEmbeddings(ds) +// .then(xsembs => { +// const st = db +// .transaction('embeddings', 'readwrite') +// .objectStore('embeddings'); +// xsembs.forEach(emb => { +// console.log(emb); +// st.add(emb); +// }); +// }); +// } +//} +//} @@ -138,7 +175,6 @@ request.onsuccess = (event) => { const first = document.querySelector('#number1'); const second = document.querySelector('#number2'); -const result = document.querySelector('.result'); const myWorker = new Worker("worker.js"); |