Skip to content

Commit 1fde9e0

Browse files
committed
adding google gemini
1 parent 0c135cb commit 1fde9e0

File tree

14 files changed

+377
-51
lines changed

14 files changed

+377
-51
lines changed

services/video-search/.env.example

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,16 +5,26 @@ REDIS_URL=<redis[s]://[[username][:password]@][host][:port][/db-number]>
55
SEARCHAPI_API_KEY=<https://www.searchapi.io/>
66
YOUTUBE_VIDEOS=<video-id-1,video-id-2,video-id-3>
77

8+
GOOGLE_API_KEY=<https://console.cloud.google.com/apis/credentials>
9+
GOOGLE_EMBEDDING_MODEL=<https://ai.google.dev/models/gemini#model_variations>
10+
GOOGLE_SUMMARY_MODEL=<https://ai.google.dev/models/gemini#model_variations>
11+
GOOGLE_VIDEO_INDEX_NAME=<redis-video-index-name>
12+
GOOGLE_VIDEO_PREFIX=<redis-video-prefix>
13+
GOOGLE_VECTOR_SET=<redis-vector-set>
14+
GOOGLE_SUMMARY_PREFIX=<redis-summary-prefix>
15+
16+
HF_EMBEDDING_MODEL=<https://huggingface.co/models?pipeline_tag=feature-extraction&library=transformers.js>
17+
HF_SUMMARY_MODEL=<https://huggingface.co/models?pipeline_tag=summarization&library=transformers.js>
18+
HF_VIDEO_INDEX_NAME=<redis-video-index-name>
19+
HF_VIDEO_PREFIX=<redis-video-prefix>
20+
HF_VECTOR_SET=<redis-vector-set>
21+
HF_SUMMARY_PREFIX=<redis-summary-prefix>
22+
823
OPENAI_API_KEY=<https://platform.openai.com/api-keys>
924
OPENAI_ORGANIZATION=<https://platform.openai.com/account/organization>
1025
OPENAI_EMBEDDING_MODEL=<https://platform.openai.com/account/limits>
1126
OPENAI_SUMMARY_MODEL=<https://platform.openai.com/account/limits>
1227
OPENAI_VIDEO_INDEX_NAME=<redis-video-index-name>
1328
OPENAI_VIDEO_PREFIX=<redis-video-prefix>
1429
OPENAI_VECTOR_SET=<redis-vector-set>
15-
16-
HF_EMBEDDING_MODEL=<https://huggingface.co/models?pipeline_tag=feature-extraction&library=transformers.js>
17-
HF_SUMMARY_MODEL=<https://huggingface.co/models?pipeline_tag=summarization&library=transformers.js>
18-
HF_VIDEO_INDEX_NAME=<redis-video-index-name>
19-
HF_VIDEO_PREFIX=<redis-video-prefix>
20-
HF_VECTOR_SET=<redis-vector-set>
30+
OPENAI_SUMMARY_PREFIX=<redis-summary-prefix>

services/video-search/package-lock.json

Lines changed: 25 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

services/video-search/package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
"watch": "nodemon --watch dist dist/index.js"
1919
},
2020
"dependencies": {
21+
"@langchain/google-genai": "^0.0.5",
2122
"@tensorflow-models/mobilenet": "^2.1.1",
2223
"@tensorflow/tfjs": "^4.15.0",
2324
"@tensorflow/tfjs-node": "^4.15.0",

services/video-search/src/config.ts

Lines changed: 38 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,20 +10,35 @@ const {
1010
YOUTUBE_VIDEOS,
1111
REDIS_URL,
1212
SEARCHAPI_API_KEY,
13+
GOOGLE_VIDEO_INDEX_NAME,
14+
GOOGLE_VIDEO_PREFIX,
15+
GOOGLE_API_KEY,
16+
GOOGLE_EMBEDDING_MODEL,
17+
GOOGLE_SUMMARY_MODEL,
18+
GOOGLE_VECTOR_SET,
19+
GOOGLE_SUMMARY_PREFIX,
1320
HF_VIDEO_INDEX_NAME,
1421
HF_VIDEO_PREFIX,
1522
HF_EMBEDDING_MODEL,
1623
HF_SUMMARY_MODEL,
1724
HF_VECTOR_SET,
25+
HF_SUMMARY_PREFIX,
1826
OPENAI_VIDEO_INDEX_NAME,
27+
OPENAI_VIDEO_PREFIX,
1928
OPENAI_API_KEY,
2029
OPENAI_ORGANIZATION,
2130
OPENAI_EMBEDDING_MODEL,
2231
OPENAI_SUMMARY_MODEL,
2332
OPENAI_VECTOR_SET,
33+
OPENAI_SUMMARY_PREFIX,
2434
USE,
2535
} = process.env;
2636

37+
const DEFAULT_VIDEO_INDEX_NAME = 'idx-videos';
38+
const DEFAULT_VIDEO_PREFIX = 'video';
39+
const DEFAULT_VECTOR_SET = 'video-vectors';
40+
const DEFAULT_SUMMARY_PREFIX = 'video-summary';
41+
2742
export default {
2843
app: {
2944
NAME: npm_package_name ?? 'video-search',
@@ -55,23 +70,39 @@ export default {
5570
API_KEY: SEARCHAPI_API_KEY ?? '',
5671
},
5772
hf: {
58-
VIDEO_INDEX_NAME: HF_VIDEO_INDEX_NAME ?? 'idx-videos-hf',
59-
VIDEO_PREFIX: HF_VIDEO_PREFIX ?? 'video-hf:',
73+
VIDEO_INDEX_NAME: HF_VIDEO_INDEX_NAME ?? `${DEFAULT_VIDEO_INDEX_NAME}-hf`,
74+
VIDEO_PREFIX: HF_VIDEO_PREFIX ?? `${DEFAULT_VIDEO_PREFIX}-hf:`,
6075
EMBEDDING_MODEL: HF_EMBEDDING_MODEL ?? 'Xenova/all-MiniLM-L6-v2',
6176
SUMMARY_MODEL: HF_SUMMARY_MODEL ?? 'Xenova/paraphrase-albert-small-v2',
62-
VECTOR_SET: HF_VECTOR_SET ?? 'video-vectors-hf',
77+
VECTOR_SET: HF_VECTOR_SET ?? `${DEFAULT_VECTOR_SET}-hf`,
78+
SUMMARY_PREFIX: HF_SUMMARY_PREFIX ?? `${DEFAULT_SUMMARY_PREFIX}-hf:`,
79+
},
80+
google: {
81+
VIDEO_INDEX_NAME:
82+
GOOGLE_VIDEO_INDEX_NAME ?? `${DEFAULT_VIDEO_INDEX_NAME}-google`,
83+
VIDEO_PREFIX: GOOGLE_VIDEO_PREFIX ?? `${DEFAULT_VIDEO_PREFIX}-google:`,
84+
API_KEY: GOOGLE_API_KEY,
85+
EMBEDDING_MODEL: GOOGLE_EMBEDDING_MODEL ?? 'embedding-001',
86+
SUMMARY_MODEL: GOOGLE_SUMMARY_MODEL ?? 'gemini-pro',
87+
VECTOR_SET: GOOGLE_VECTOR_SET ?? `${DEFAULT_VECTOR_SET}-google`,
88+
SUMMARY_PREFIX:
89+
GOOGLE_SUMMARY_PREFIX ?? `${DEFAULT_SUMMARY_PREFIX}-google:`,
6390
},
6491
openai: {
65-
VIDEO_INDEX_NAME: OPENAI_VIDEO_INDEX_NAME ?? 'idx-videos',
66-
VIDEO_PREFIX: OPENAI_VIDEO_INDEX_NAME ?? 'video:',
92+
VIDEO_INDEX_NAME:
93+
OPENAI_VIDEO_INDEX_NAME ?? `${DEFAULT_VIDEO_INDEX_NAME}-openai`,
94+
VIDEO_PREFIX: OPENAI_VIDEO_PREFIX ?? `${DEFAULT_VIDEO_PREFIX}-openai:`,
6795
API_KEY: OPENAI_API_KEY,
6896
ORGANIZATION: OPENAI_ORGANIZATION,
6997
EMBEDDING_MODEL: OPENAI_EMBEDDING_MODEL ?? 'gpt-4',
7098
SUMMARY_MODEL: OPENAI_SUMMARY_MODEL ?? 'gpt-4',
71-
VECTOR_SET: OPENAI_VECTOR_SET ?? 'video-vectors',
99+
VECTOR_SET: OPENAI_VECTOR_SET ?? `${DEFAULT_VECTOR_SET}-openai`,
100+
SUMMARY_PREFIX:
101+
OPENAI_SUMMARY_PREFIX ?? `${DEFAULT_SUMMARY_PREFIX}-openai:`,
72102
},
73103
use: {
74-
OPENAI: USE === 'OPENAI',
104+
GOOGLE: USE === 'GOOGLE',
75105
HF: USE === 'HF',
106+
OPENAI: USE === 'OPENAI',
76107
},
77108
};
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
import config from '../config.js';
2+
import { client } from '../db.js';
3+
import { RedisVectorStore } from 'langchain/vectorstores/redis';
4+
import { ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings } from '@langchain/google-genai';
5+
6+
/**
 * Options for the Gemini chat model; the API key and model name are read
 * from the `google` section of the application config.
 */
const googleChatModelOptions = {
  apiKey: config.google.API_KEY,
  modelName: config.google.SUMMARY_MODEL,
  // NOTE(review): 10000 may exceed the output-token limit of the configured
  // model — confirm against the Gemini model documentation.
  maxOutputTokens: 10000,
};

/** Shared Gemini chat model instance exported for use by other modules. */
export const llm = new ChatGoogleGenerativeAI(googleChatModelOptions);
11+
12+
/**
 * Creates a Google Generative AI embeddings client.
 *
 * @param modelName - Embedding model to use. When omitted (or nullish),
 *   `config.google.EMBEDDING_MODEL` is used instead.
 * @returns A new `GoogleGenerativeAIEmbeddings` instance configured with the
 *   Google API key from the application config.
 */
export function getEmbeddings(modelName?: string) {
  const embeddingModel = modelName ?? config.google.EMBEDDING_MODEL;

  return new GoogleGenerativeAIEmbeddings({
    apiKey: config.google.API_KEY,
    modelName: embeddingModel,
  });
}
18+
19+
const {
  VIDEO_INDEX_NAME: googleVideoIndexName,
  VIDEO_PREFIX: googleVideoKeyPrefix,
} = config.google;

/**
 * Redis-backed vector store for the Google provider, built with the default
 * Google embeddings and the index name / key prefix from the `google` config.
 */
export const vectorStore = new RedisVectorStore(getEmbeddings(), {
  redisClient: client,
  indexName: googleVideoIndexName,
  keyPrefix: googleVideoKeyPrefix,
});
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
export * as summarize from './summarize.js';
2+
export { search } from './search.js';
3+
export { store } from './store.js';
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
import * as summarize from './summarize.js';
2+
import { VideoDocument } from '../transcripts/index.js';
3+
import { vectorStore } from './config.js';
4+
import log from '../log.js';
5+
6+
/**
 * Runs a standalone similarity search against the Google vector store.
 *
 * @param question - Text to search for.
 * @returns A promise for the three nearest documents, typed as
 *   `VideoDocument[]`.
 */
async function getVideos(question: string) {
  const neighborCount = 3;
  const logContext = {
    question,
    location: 'google.search.search',
  };

  log.debug(
    `Performing similarity search for videos that answer: ${question}`,
    logContext,
  );

  const matches = vectorStore.similaritySearch(
    question,
    neighborCount,
  ) as Promise<VideoDocument[]>;

  return matches;
}
21+
22+
/**
 * Finds videos relevant to a question.
 *
 * The question is first rewritten by `summarize.question`, and the rewritten
 * form is used for the similarity search. If that yields no videos, the
 * search is repeated with the original question.
 *
 * @param question - The question as originally asked.
 * @returns The matching video documents (possibly an empty array).
 */
export async function search(question: string) {
  const location = 'google.search.search';

  log.debug(`Original question: ${question}`, { location });

  const semanticQuestion = await summarize.question(question);

  log.debug(`Semantic question: ${semanticQuestion}`, { location });

  const semanticMatches = await getVideos(semanticQuestion);

  if (semanticMatches.length > 0) {
    return semanticMatches;
  }

  log.debug(
    'No videos found for semantic question, trying with original question',
    { location },
  );

  return getVideos(question);
}
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
import { client } from '../db.js';
2+
import { vectorStore } from './config.js';
3+
import { VideoDocument } from '../transcripts/index.js';
4+
import config from '../config.js';
5+
import log from '../log.js';
6+
7+
/**
 * Adds documents to the Google vector store, skipping any whose
 * `metadata.id` is already a member of the `config.google.VECTOR_SET`
 * Redis set.
 *
 * Membership checks run concurrently, but the new documents are collected
 * in the same order as the input so the batch handed to the vector store is
 * deterministic (pushing from inside the concurrent callbacks would order
 * them by whichever Redis reply arrived first).
 *
 * @param documents - Candidate documents to store.
 * @returns Resolves once new documents are stored and their ids recorded;
 *   resolves without storing anything when no document is new.
 */
export async function store(documents: VideoDocument[]) {
  log.debug('Storing documents...', {
    location: 'google.store.store',
  });

  // One membership flag per input document, index-aligned with `documents`.
  const alreadyStored = await Promise.all(
    documents.map((doc) =>
      client.sIsMember(config.google.VECTOR_SET, doc.metadata.id),
    ),
  );

  const newDocuments: VideoDocument[] = documents.filter(
    (_, index) => !alreadyStored[index],
  );

  log.debug(`Found ${newDocuments.length} new documents`, {
    location: 'google.store.store',
  });

  if (newDocuments.length === 0) {
    return;
  }

  await vectorStore.addDocuments(newDocuments);

  // Record every stored id with a single SADD instead of one call per id.
  await client.sAdd(
    config.google.VECTOR_SET,
    newDocuments.map((doc) => doc.metadata.id),
  );
}
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
import { Document } from 'langchain/document';
2+
import { TokenTextSplitter } from 'langchain/text_splitter';
3+
import { VideoDocument } from '../transcripts/index.js';
4+
import {
5+
QUESTION_PROMPT,
6+
SUMMARY_PROMPT,
7+
SUMMARY_REFINE_PROMPT,
8+
} from '../templates/index.js';
9+
import { loadSummarizationChain } from 'langchain/chains';
10+
import { llm } from './config.js';
11+
import { StringOutputParser } from 'langchain/schema/output_parser';
12+
import { cacheAside } from '../db.js';
13+
import log from '../log.js';
14+
import config from '../config.js';
15+
16+
// Token budget for each transcript chunk and the overlap between
// neighbouring chunks.
const SUMMARY_CHUNK_SIZE = 10000;
const SUMMARY_CHUNK_OVERLAP = 250;

/** Splits transcript documents into token-bounded chunks. */
const splitter = new TokenTextSplitter({
  chunkSize: SUMMARY_CHUNK_SIZE,
  chunkOverlap: SUMMARY_CHUNK_OVERLAP,
});

/** "refine"-type summarization chain driven by the summary prompts. */
const videoSummarizeChain = loadSummarizationChain(llm, {
  type: 'refine',
  questionPrompt: SUMMARY_PROMPT,
  refinePrompt: SUMMARY_REFINE_PROMPT,
});

/** Pipes the question prompt through the LLM and parses the output to a string. */
const questionOutputParser = new StringOutputParser();
const questionSummarizeChain = QUESTION_PROMPT.pipe(llm).pipe(
  questionOutputParser,
);

/** Cache-aside helper created with the Google summary key prefix. */
const cache = cacheAside(config.google.SUMMARY_PREFIX);
32+
33+
/**
 * Produces one summary document per group of video documents.
 *
 * For each group, the first document's metadata identifies the video. A
 * summary cached under `metadata.id` is reused when present; otherwise the
 * group is split into chunks, summarized with the refine chain, and the
 * result is written back to the cache. Groups are processed sequentially.
 *
 * Empty groups are skipped: they carry no metadata to key a summary on, and
 * reading `metadata` from a missing first document would throw.
 *
 * @param allDocs - Groups of documents; each group's first document supplies
 *   the metadata for the resulting summary.
 * @returns One summary document per non-empty group, in input order.
 */
export async function docs(allDocs: VideoDocument[][]) {
  const summarizedDocs: VideoDocument[] = [];

  for (const videoDocs of allDocs) {
    const [firstDoc] = videoDocs;

    if (firstDoc === undefined) {
      log.debug('Skipping empty document group', {
        location: 'google.summarize.docs',
      });

      continue;
    }

    const { metadata } = firstDoc;

    log.debug(`Summarizing ${metadata.link}`, {
      ...metadata,
      location: 'google.summarize.docs',
    });

    let summary = await cache.get(metadata.id);

    if (!summary) {
      // Cache miss: summarize the chunked documents and remember the result.
      const chunks = await splitter.splitDocuments(videoDocs);
      summary = await videoSummarizeChain.run(chunks);

      log.debug(`Summarized ${metadata.link}:\n ${summary}`, {
        summary,
        location: 'google.summarize.docs',
      });
      await cache.set(metadata.id, summary);
    }

    summarizedDocs.push(
      new Document({
        metadata,
        pageContent: summary,
      }),
    );
  }

  return summarizedDocs;
}
73+
74+
/**
 * Rewrites a question by running it through the question-summarization
 * chain (question prompt, then the LLM, then a string output parser).
 *
 * @param question - The question to rewrite.
 * @returns The chain's string output.
 */
export async function question(question: string) {
  return questionSummarizeChain.invoke({ question });
}

0 commit comments

Comments
 (0)