From d8350591c84f945b2a0b133d2f54cbb51e53e8b4 Mon Sep 17 00:00:00 2001 From: Jiri Spilka Date: Tue, 11 Mar 2025 19:54:59 +0100 Subject: [PATCH] feat: Add scraping tool option --- src/server.ts | 66 ++++++++------------------------------------------- 1 file changed, 10 insertions(+), 56 deletions(-) diff --git a/src/server.ts b/src/server.ts index de7efc9..2b366e1 100644 --- a/src/server.ts +++ b/src/server.ts @@ -8,8 +8,6 @@ import { Server } from '@modelcontextprotocol/sdk/server/index.js'; import type { Transport } from '@modelcontextprotocol/sdk/shared/transport.js'; import { CallToolRequestSchema, - GetPromptRequestSchema, - ListPromptsRequestSchema, ListToolsRequestSchema, } from '@modelcontextprotocol/sdk/types.js'; import dotenv from 'dotenv'; @@ -32,28 +30,13 @@ const WebBrowserArgsSchema = z.object({ .describe( 'The maximum number of top organic Google Search results whose web pages will be extracted', ), + scrapingTool: z.enum(['browser-playwright', 'raw-http']) + .describe('Select a scraping tool for extracting the target web pages. ' + + 'The Browser tool is more powerful and can handle JavaScript heavy websites, while the ' + + 'Plain HTML tool can not handle JavaScript but is about two times faster.') + .default('raw-http'), }); -const PROMPTS = [ - { - name: TOOL_SEARCH, - description: 'Search phrase or a URL at Google and return crawled web pages as text or Markdown', - arguments: [ - { - name: 'query', - description: 'Google Search keywords or a URL of a specific web page', - required: true, - }, - { - name: 'maxResults', - description: 'The maximum number of top organic Google Search results whose web pages' - + ' will be extracted (default: 1)', - required: false, - }, - ], - }, -]; - /** * Create an MCP server with a tool to call RAG Web Browser Actor */ @@ -74,11 +57,10 @@ export class RagWebBrowserServer { }, ); this.setupErrorHandling(); - this.setupPromptHandlers(); this.setupToolHandlers(); } - private async callRagWebBrowser(query: string, maxResults: number): Promise { + private async callRagWebBrowser(query: string, maxResults: number, scrapingTool: string): Promise { if (!APIFY_API_TOKEN) { throw new Error('APIFY_API_TOKEN is required but not set. ' + 'Please set it in your environment variables or pass it as a command-line argument.'); @@ -87,6 +69,7 @@ export class RagWebBrowserServer { const queryParams = new URLSearchParams({ query, maxResults: maxResults.toString(), + scrapingTool, }); const url = `${ACTOR_BASE_URL}?${queryParams.toString()}`; const response = await fetch(url, { @@ -114,43 +97,14 @@ export class RagWebBrowserServer { }); } - private setupPromptHandlers(): void { - this.server.setRequestHandler(ListPromptsRequestSchema, async () => { - return { - prompts: PROMPTS, - }; - }); - - this.server.setRequestHandler(GetPromptRequestSchema, async (request) => { - const { name, arguments: args } = request.params; - switch (name) { - case TOOL_SEARCH: { - const parsed = WebBrowserArgsSchema.parse(args); - const content = await this.callRagWebBrowser(parsed.query, parsed.maxResults); - return { - description: `Markdown content for search query: ${parsed.query}`, - messages: [ - { - role: 'user', - content: { type: 'text', text: content }, - }, - ], - }; - } - default: { - throw new Error(`Unknown prompt: ${name}`); - } - } - }); - } - private setupToolHandlers(): void { this.server.setRequestHandler(ListToolsRequestSchema, async () => { return { tools: [ { name: TOOL_SEARCH, - description: 'Search phrase or a URL at Google and return crawled web pages as text or Markdown', + description: 'Search phrase or a URL at Google and return crawled web pages as text or Markdown. ' + + 'Prefer HTTP client for speed and browser-playwright for reability.', inputSchema: zodToJsonSchema(WebBrowserArgsSchema), }, ], @@ -161,7 +115,7 @@ export class RagWebBrowserServer { switch (name) { case TOOL_SEARCH: { const parsed = WebBrowserArgsSchema.parse(args); - const content = await this.callRagWebBrowser(parsed.query, parsed.maxResults); + const content = await this.callRagWebBrowser(parsed.query, parsed.maxResults, parsed.scrapingTool); return { content: [{ type: 'text', text: content }], };