From 503dbd19b8cec4d2ff4575786b0eec25db2e80e6 Mon Sep 17 00:00:00 2001 From: PeriniM Date: Sat, 1 Feb 2025 16:23:45 +0100 Subject: [PATCH 1/7] feat: merged localscraper into smartscraper --- .../examples/localscraper_example.py | 31 --------- .../examples/smartscraper_example.py | 2 + .../examples/smartscraper_schema_example.py | 1 + scrapegraph-py/scrapegraph_py/async_client.py | 51 +++----------- scrapegraph-py/scrapegraph_py/client.py | 49 +++----------- .../scrapegraph_py/models/localscraper.py | 67 ------------------- .../scrapegraph_py/models/smartscraper.py | 38 ++++++++--- 7 files changed, 48 insertions(+), 191 deletions(-) delete mode 100644 scrapegraph-py/examples/localscraper_example.py delete mode 100644 scrapegraph-py/scrapegraph_py/models/localscraper.py diff --git a/scrapegraph-py/examples/localscraper_example.py b/scrapegraph-py/examples/localscraper_example.py deleted file mode 100644 index 65bbed6..0000000 --- a/scrapegraph-py/examples/localscraper_example.py +++ /dev/null @@ -1,31 +0,0 @@ -from scrapegraph_py import Client -from scrapegraph_py.logger import sgai_logger - -sgai_logger.set_logging(level="INFO") - -# Initialize the client -sgai_client = Client(api_key="your-api-key-here") - -# Example HTML content -html_content = """ - - -
-<html>
-<body>
-    <h1>Company Name</h1>
-    <p>We are a technology company focused on AI solutions.</p>
-
-    <div class="contact">
-        <p>Email: contact@example.com</p>
-        <p>Phone: (555) 123-4567</p>
-    </div>
-</body>
-</html>
- - -""" - -# LocalScraper request -response = sgai_client.localscraper( - user_prompt="Extract the company description and contact information", - website_html=html_content, -) - -# Print the response -print(f"Request ID: {response['request_id']}") -print(f"Result: {response['result']}") diff --git a/scrapegraph-py/examples/smartscraper_example.py b/scrapegraph-py/examples/smartscraper_example.py index 37e4542..f583804 100644 --- a/scrapegraph-py/examples/smartscraper_example.py +++ b/scrapegraph-py/examples/smartscraper_example.py @@ -9,9 +9,11 @@ # SmartScraper request response = sgai_client.smartscraper( website_url="https://example.com", + # website_html="...", # Optional, if you want to pass in HTML content instead of a URL user_prompt="Extract the main heading, description, and summary of the webpage", ) + # Print the response print(f"Request ID: {response['request_id']}") print(f"Result: {response['result']}") diff --git a/scrapegraph-py/examples/smartscraper_schema_example.py b/scrapegraph-py/examples/smartscraper_schema_example.py index 3553a22..5b54bd8 100644 --- a/scrapegraph-py/examples/smartscraper_schema_example.py +++ b/scrapegraph-py/examples/smartscraper_schema_example.py @@ -16,6 +16,7 @@ class WebpageSchema(BaseModel): # SmartScraper request with output schema response = sgai_client.smartscraper( website_url="https://example.com", + # website_html="...", # Optional, if you want to pass in HTML content instead of a URL user_prompt="Extract webpage information", output_schema=WebpageSchema, ) diff --git a/scrapegraph-py/scrapegraph_py/async_client.py b/scrapegraph-py/scrapegraph_py/async_client.py index ce74b96..6943fc6 100644 --- a/scrapegraph-py/scrapegraph_py/async_client.py +++ b/scrapegraph-py/scrapegraph_py/async_client.py @@ -9,10 +9,6 @@ from scrapegraph_py.exceptions import APIError from scrapegraph_py.logger import sgai_logger as logger from scrapegraph_py.models.feedback import FeedbackRequest -from scrapegraph_py.models.localscraper import ( - GetLocalScraperRequest, - LocalScraperRequest, -) from scrapegraph_py.models.markdownify import GetMarkdownifyRequest, MarkdownifyRequest from scrapegraph_py.models.smartscraper import ( GetSmartScraperRequest, @@ -165,16 +161,22 @@ async def get_markdownify(self, request_id: str): async def smartscraper( self, - website_url: str, user_prompt: str, + website_url: Optional[str] = None, + website_html: Optional[str] = None, output_schema: Optional[BaseModel] = None, ): """Send a smartscraper request""" - logger.info(f"🔍 Starting smartscraper request for {website_url}") + logger.info("🔍 Starting smartscraper request") + if website_url: + logger.debug(f"🌐 URL: {website_url}") + if website_html: + logger.debug("📄 Using provided HTML content") logger.debug(f"📝 Prompt: {user_prompt}") request = SmartScraperRequest( website_url=website_url, + website_html=website_html, user_prompt=user_prompt, output_schema=output_schema, ) @@ -200,43 +202,6 @@ async def get_smartscraper(self, request_id: str): logger.info(f"✨ Successfully retrieved result for request {request_id}") return result - async def localscraper( - self, - user_prompt: str, - website_html: str, - output_schema: Optional[BaseModel] = None, - ): - """Send a localscraper request""" - logger.info("🔍 Starting localscraper request") - logger.debug(f"📝 Prompt: {user_prompt}") - - request = LocalScraperRequest( - user_prompt=user_prompt, - website_html=website_html, - output_schema=output_schema, - ) - logger.debug("✅ Request validation passed") - - result = await 
self._make_request( - "POST", f"{API_BASE_URL}/localscraper", json=request.model_dump() - ) - logger.info("✨ Localscraper request completed successfully") - return result - - async def get_localscraper(self, request_id: str): - """Get the result of a previous localscraper request""" - logger.info(f"🔍 Fetching localscraper result for request {request_id}") - - # Validate input using Pydantic model - GetLocalScraperRequest(request_id=request_id) - logger.debug("✅ Request ID validation passed") - - result = await self._make_request( - "GET", f"{API_BASE_URL}/localscraper/{request_id}" - ) - logger.info(f"✨ Successfully retrieved result for request {request_id}") - return result - async def submit_feedback( self, request_id: str, rating: int, feedback_text: Optional[str] = None ): diff --git a/scrapegraph-py/scrapegraph_py/client.py b/scrapegraph-py/scrapegraph_py/client.py index 5e61a21..3021549 100644 --- a/scrapegraph-py/scrapegraph_py/client.py +++ b/scrapegraph-py/scrapegraph_py/client.py @@ -10,10 +10,6 @@ from scrapegraph_py.exceptions import APIError from scrapegraph_py.logger import sgai_logger as logger from scrapegraph_py.models.feedback import FeedbackRequest -from scrapegraph_py.models.localscraper import ( - GetLocalScraperRequest, - LocalScraperRequest, -) from scrapegraph_py.models.markdownify import GetMarkdownifyRequest, MarkdownifyRequest from scrapegraph_py.models.smartscraper import ( GetSmartScraperRequest, @@ -175,16 +171,22 @@ def get_markdownify(self, request_id: str): def smartscraper( self, - website_url: str, user_prompt: str, + website_url: Optional[str] = None, + website_html: Optional[str] = None, output_schema: Optional[BaseModel] = None, ): """Send a smartscraper request""" - logger.info(f"🔍 Starting smartscraper request for {website_url}") + logger.info("🔍 Starting smartscraper request") + if website_url: + logger.debug(f"🌐 URL: {website_url}") + if website_html: + logger.debug("📄 Using provided HTML content") logger.debug(f"📝 Prompt: {user_prompt}") request = SmartScraperRequest( website_url=website_url, + website_html=website_html, user_prompt=user_prompt, output_schema=output_schema, ) @@ -208,41 +210,6 @@ def get_smartscraper(self, request_id: str): logger.info(f"✨ Successfully retrieved result for request {request_id}") return result - def localscraper( - self, - user_prompt: str, - website_html: str, - output_schema: Optional[BaseModel] = None, - ): - """Send a localscraper request""" - logger.info("🔍 Starting localscraper request") - logger.debug(f"📝 Prompt: {user_prompt}") - - request = LocalScraperRequest( - user_prompt=user_prompt, - website_html=website_html, - output_schema=output_schema, - ) - logger.debug("✅ Request validation passed") - - result = self._make_request( - "POST", f"{API_BASE_URL}/localscraper", json=request.model_dump() - ) - logger.info("✨ Localscraper request completed successfully") - return result - - def get_localscraper(self, request_id: str): - """Get the result of a previous localscraper request""" - logger.info(f"🔍 Fetching localscraper result for request {request_id}") - - # Validate input using Pydantic model - GetLocalScraperRequest(request_id=request_id) - logger.debug("✅ Request ID validation passed") - - result = self._make_request("GET", f"{API_BASE_URL}/localscraper/{request_id}") - logger.info(f"✨ Successfully retrieved result for request {request_id}") - return result - def submit_feedback( self, request_id: str, rating: int, feedback_text: Optional[str] = None ): diff --git 
a/scrapegraph-py/scrapegraph_py/models/localscraper.py b/scrapegraph-py/scrapegraph_py/models/localscraper.py deleted file mode 100644 index cdcfe37..0000000 --- a/scrapegraph-py/scrapegraph_py/models/localscraper.py +++ /dev/null @@ -1,67 +0,0 @@ -# Models for localscraper endpoint - -from typing import Optional, Type -from uuid import UUID - -from bs4 import BeautifulSoup -from pydantic import BaseModel, Field, model_validator - - -class LocalScraperRequest(BaseModel): - user_prompt: str = Field( - ..., - example="Extract info about the company", - ) - website_html: str = Field( - ..., - example="
<html><body><h1>Title</h1><p>Content</p></body></html>
", - description="HTML content, maximum size 2MB", - ) - output_schema: Optional[Type[BaseModel]] = None - - @model_validator(mode="after") - def validate_user_prompt(self) -> "LocalScraperRequest": - if self.user_prompt is None or not self.user_prompt.strip(): - raise ValueError("User prompt cannot be empty") - if not any(c.isalnum() for c in self.user_prompt): - raise ValueError("User prompt must contain a valid prompt") - return self - - @model_validator(mode="after") - def validate_website_html(self) -> "LocalScraperRequest": - if self.website_html is None or not self.website_html.strip(): - raise ValueError("Website HTML cannot be empty") - - if len(self.website_html.encode("utf-8")) > 2 * 1024 * 1024: - raise ValueError("Website HTML content exceeds maximum size of 2MB") - - try: - soup = BeautifulSoup(self.website_html, "html.parser") - if not soup.find(): - raise ValueError("Invalid HTML - no parseable content found") - except Exception as e: - raise ValueError(f"Invalid HTML structure: {str(e)}") - - return self - - def model_dump(self, *args, **kwargs) -> dict: - data = super().model_dump(*args, **kwargs) - # Convert the Pydantic model schema to dict if present - if self.output_schema is not None: - data["output_schema"] = self.output_schema.model_json_schema() - return data - - -class GetLocalScraperRequest(BaseModel): - """Request model for get_localscraper endpoint""" - - request_id: str = Field(..., example="123e4567-e89b-12d3-a456-426614174000") - - @model_validator(mode="after") - def validate_request_id(self) -> "GetLocalScraperRequest": - try: - # Validate the request_id is a valid UUID - UUID(self.request_id) - except ValueError: - raise ValueError("request_id must be a valid UUID") - return self diff --git a/scrapegraph-py/scrapegraph_py/models/smartscraper.py b/scrapegraph-py/scrapegraph_py/models/smartscraper.py index c1fdd9a..e589346 100644 --- a/scrapegraph-py/scrapegraph_py/models/smartscraper.py +++ b/scrapegraph-py/scrapegraph_py/models/smartscraper.py @@ -3,6 +3,7 @@ from typing import Optional, Type from uuid import UUID +from bs4 import BeautifulSoup from pydantic import BaseModel, Field, model_validator @@ -11,7 +12,14 @@ class SmartScraperRequest(BaseModel): ..., example="Extract info about the company", ) - website_url: str = Field(..., example="https://scrapegraphai.com/") + website_url: Optional[str] = Field( + default=None, example="https://scrapegraphai.com/" + ) + website_html: Optional[str] = Field( + default=None, + example="
<html><body><h1>Title</h1><p>Content</p></body></html>
", + description="HTML content, maximum size 2MB", + ) output_schema: Optional[Type[BaseModel]] = None @model_validator(mode="after") @@ -23,14 +31,26 @@ def validate_user_prompt(self) -> "SmartScraperRequest": return self @model_validator(mode="after") - def validate_url(self) -> "SmartScraperRequest": - if self.website_url is None or not self.website_url.strip(): - raise ValueError("Website URL cannot be empty") - if not ( - self.website_url.startswith("http://") - or self.website_url.startswith("https://") - ): - raise ValueError("Invalid URL") + def validate_url_and_html(self) -> "SmartScraperRequest": + if self.website_html is not None: + if len(self.website_html.encode("utf-8")) > 2 * 1024 * 1024: + raise ValueError("Website HTML content exceeds maximum size of 2MB") + try: + soup = BeautifulSoup(self.website_html, "html.parser") + if not soup.find(): + raise ValueError("Invalid HTML - no parseable content found") + except Exception as e: + raise ValueError(f"Invalid HTML structure: {str(e)}") + elif self.website_url is not None: + if not self.website_url.strip(): + raise ValueError("Website URL cannot be empty") + if not ( + self.website_url.startswith("http://") + or self.website_url.startswith("https://") + ): + raise ValueError("Invalid URL") + else: + raise ValueError("Either website_url or website_html must be provided") return self def model_dump(self, *args, **kwargs) -> dict: From bb851d785d121b039d5e968327fb930955a3fd92 Mon Sep 17 00:00:00 2001 From: PeriniM Date: Sat, 1 Feb 2025 16:35:28 +0100 Subject: [PATCH 2/7] feat: add optional headers to request --- .../examples/optional_headers_example.py | 28 +++++++++++++++++++ scrapegraph-py/scrapegraph_py/async_client.py | 12 ++++++-- scrapegraph-py/scrapegraph_py/client.py | 10 +++++-- .../scrapegraph_py/models/markdownify.py | 9 ++++++ .../scrapegraph_py/models/smartscraper.py | 8 ++++++ 5 files changed, 63 insertions(+), 4 deletions(-) create mode 100644 scrapegraph-py/examples/optional_headers_example.py diff --git a/scrapegraph-py/examples/optional_headers_example.py b/scrapegraph-py/examples/optional_headers_example.py new file mode 100644 index 0000000..7763f8f --- /dev/null +++ b/scrapegraph-py/examples/optional_headers_example.py @@ -0,0 +1,28 @@ +from scrapegraph_py import Client +from scrapegraph_py.logger import sgai_logger + +sgai_logger.set_logging(level="INFO") + +# Initialize the client with explicit API key +sgai_client = Client(api_key="your-api-key-here") + +# SmartScraper request +response = sgai_client.smartscraper( + website_url="https://example.com", + user_prompt="Extract the main heading, description, and summary of the webpage", + headers={ + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.9", + "Accept-Encoding": "gzip, deflate, br", + "Connection": "keep-alive", + "Upgrade-Insecure-Requests": "1", + }, +) + + +# Print the response +print(f"Request ID: {response['request_id']}") +print(f"Result: {response['result']}") + +sgai_client.close() diff --git a/scrapegraph-py/scrapegraph_py/async_client.py b/scrapegraph-py/scrapegraph_py/async_client.py index 6943fc6..dffe02d 100644 --- a/scrapegraph-py/scrapegraph_py/async_client.py +++ b/scrapegraph-py/scrapegraph_py/async_client.py @@ -132,11 +132,15 @@ async def _make_request(self, method: str, url: str, **kwargs) -> Any: logger.info(f"⏳ Waiting 
{retry_delay}s before retry {attempt + 2}") await asyncio.sleep(retry_delay) - async def markdownify(self, website_url: str): + async def markdownify( + self, website_url: str, headers: Optional[dict[str, str]] = None + ): """Send a markdownify request""" logger.info(f"🔍 Starting markdownify request for {website_url}") + if headers: + logger.debug("🔧 Using custom headers") - request = MarkdownifyRequest(website_url=website_url) + request = MarkdownifyRequest(website_url=website_url, headers=headers) logger.debug("✅ Request validation passed") result = await self._make_request( @@ -164,6 +168,7 @@ async def smartscraper( user_prompt: str, website_url: Optional[str] = None, website_html: Optional[str] = None, + headers: Optional[dict[str, str]] = None, output_schema: Optional[BaseModel] = None, ): """Send a smartscraper request""" @@ -172,11 +177,14 @@ async def smartscraper( logger.debug(f"🌐 URL: {website_url}") if website_html: logger.debug("📄 Using provided HTML content") + if headers: + logger.debug("🔧 Using custom headers") logger.debug(f"📝 Prompt: {user_prompt}") request = SmartScraperRequest( website_url=website_url, website_html=website_html, + headers=headers, user_prompt=user_prompt, output_schema=output_schema, ) diff --git a/scrapegraph-py/scrapegraph_py/client.py b/scrapegraph-py/scrapegraph_py/client.py index 3021549..860e254 100644 --- a/scrapegraph-py/scrapegraph_py/client.py +++ b/scrapegraph-py/scrapegraph_py/client.py @@ -144,11 +144,13 @@ def _make_request(self, method: str, url: str, **kwargs) -> Any: logger.error(f"🔴 Connection Error: {str(e)}") raise ConnectionError(f"Failed to connect to API: {str(e)}") - def markdownify(self, website_url: str): + def markdownify(self, website_url: str, headers: Optional[dict[str, str]] = None): """Send a markdownify request""" logger.info(f"🔍 Starting markdownify request for {website_url}") + if headers: + logger.debug("🔧 Using custom headers") - request = MarkdownifyRequest(website_url=website_url) + request = MarkdownifyRequest(website_url=website_url, headers=headers) logger.debug("✅ Request validation passed") result = self._make_request( @@ -174,6 +176,7 @@ def smartscraper( user_prompt: str, website_url: Optional[str] = None, website_html: Optional[str] = None, + headers: Optional[dict[str, str]] = None, output_schema: Optional[BaseModel] = None, ): """Send a smartscraper request""" @@ -182,11 +185,14 @@ def smartscraper( logger.debug(f"🌐 URL: {website_url}") if website_html: logger.debug("📄 Using provided HTML content") + if headers: + logger.debug("🔧 Using custom headers") logger.debug(f"📝 Prompt: {user_prompt}") request = SmartScraperRequest( website_url=website_url, website_html=website_html, + headers=headers, user_prompt=user_prompt, output_schema=output_schema, ) diff --git a/scrapegraph-py/scrapegraph_py/models/markdownify.py b/scrapegraph-py/scrapegraph_py/models/markdownify.py index 5b12aa2..678ee6c 100644 --- a/scrapegraph-py/scrapegraph_py/models/markdownify.py +++ b/scrapegraph-py/scrapegraph_py/models/markdownify.py @@ -1,5 +1,6 @@ # Models for markdownify endpoint +from typing import Optional from uuid import UUID from pydantic import BaseModel, Field, model_validator @@ -7,6 +8,14 @@ class MarkdownifyRequest(BaseModel): website_url: str = Field(..., example="https://scrapegraphai.com/") + headers: Optional[dict[str, str]] = Field( + None, + example={ + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", + "Cookie": "cookie1=value1; cookie2=value2", + }, + description="Optional headers 
to send with the request, including cookies and user agent", + ) @model_validator(mode="after") def validate_url(self) -> "MarkdownifyRequest": diff --git a/scrapegraph-py/scrapegraph_py/models/smartscraper.py b/scrapegraph-py/scrapegraph_py/models/smartscraper.py index e589346..21b346e 100644 --- a/scrapegraph-py/scrapegraph_py/models/smartscraper.py +++ b/scrapegraph-py/scrapegraph_py/models/smartscraper.py @@ -20,6 +20,14 @@ class SmartScraperRequest(BaseModel): example="
<html><body><h1>Title</h1><p>Content</p></body></html>
", description="HTML content, maximum size 2MB", ) + headers: Optional[dict[str, str]] = Field( + None, + example={ + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", + "Cookie": "cookie1=value1; cookie2=value2", + }, + description="Optional headers to send with the request, including cookies and user agent", + ) output_schema: Optional[Type[BaseModel]] = None @model_validator(mode="after") From 9149ce85a78b503098f80910c20de69831030378 Mon Sep 17 00:00:00 2001 From: PeriniM Date: Sat, 1 Feb 2025 16:43:37 +0100 Subject: [PATCH 3/7] chore(tests): updated tests --- scrapegraph-py/tests/test_async_client.py | 94 +++++++++++++++------- scrapegraph-py/tests/test_client.py | 94 +++++++++++++++------- scrapegraph-py/tests/test_models.py | 97 +++++++++++++---------- 3 files changed, 183 insertions(+), 102 deletions(-) diff --git a/scrapegraph-py/tests/test_async_client.py b/scrapegraph-py/tests/test_async_client.py index 78f9717..167cb9a 100644 --- a/scrapegraph-py/tests/test_async_client.py +++ b/scrapegraph-py/tests/test_async_client.py @@ -19,7 +19,7 @@ def mock_uuid(): @pytest.mark.asyncio -async def test_smartscraper(mock_api_key): +async def test_smartscraper_with_url(mock_api_key): with aioresponses() as mocked: mocked.post( "https://api.scrapegraphai.com/v1/smartscraper", @@ -38,6 +38,54 @@ async def test_smartscraper(mock_api_key): assert "description" in response["result"] +@pytest.mark.asyncio +async def test_smartscraper_with_html(mock_api_key): + with aioresponses() as mocked: + mocked.post( + "https://api.scrapegraphai.com/v1/smartscraper", + payload={ + "request_id": str(uuid4()), + "status": "completed", + "result": {"description": "Test content."}, + }, + ) + + async with AsyncClient(api_key=mock_api_key) as client: + response = await client.smartscraper( + website_html="
<html><body><h1>Test content</h1></body></html>
", + user_prompt="Extract info", + ) + assert response["status"] == "completed" + assert "description" in response["result"] + + +@pytest.mark.asyncio +async def test_smartscraper_with_headers(mock_api_key): + with aioresponses() as mocked: + mocked.post( + "https://api.scrapegraphai.com/v1/smartscraper", + payload={ + "request_id": str(uuid4()), + "status": "completed", + "result": {"description": "Example domain."}, + }, + ) + + headers = { + "User-Agent": "Mozilla/5.0", + "Cookie": "session=123", + } + + async with AsyncClient(api_key=mock_api_key) as client: + response = await client.smartscraper( + website_url="https://example.com", + user_prompt="Describe this page.", + headers=headers, + ) + assert response["status"] == "completed" + assert "description" in response["result"] + + @pytest.mark.asyncio async def test_get_credits(mock_api_key): with aioresponses() as mocked: @@ -122,57 +170,43 @@ async def test_markdownify(mock_api_key): @pytest.mark.asyncio -async def test_get_markdownify(mock_api_key, mock_uuid): - with aioresponses() as mocked: - mocked.get( - f"https://api.scrapegraphai.com/v1/markdownify/{mock_uuid}", - payload={ - "request_id": mock_uuid, - "status": "completed", - "result": "# Example Page\n\nThis is markdown content.", - }, - ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.get_markdownify(mock_uuid) - assert response["status"] == "completed" - assert response["request_id"] == mock_uuid - - -@pytest.mark.asyncio -async def test_localscraper(mock_api_key): +async def test_markdownify_with_headers(mock_api_key): with aioresponses() as mocked: mocked.post( - "https://api.scrapegraphai.com/v1/localscraper", + "https://api.scrapegraphai.com/v1/markdownify", payload={ "request_id": str(uuid4()), "status": "completed", - "result": {"extracted_info": "Test content"}, + "result": "# Example Page\n\nThis is markdown content.", }, ) + headers = { + "User-Agent": "Mozilla/5.0", + "Cookie": "session=123", + } + async with AsyncClient(api_key=mock_api_key) as client: - response = await client.localscraper( - user_prompt="Extract info", - website_html="
<html><body><h1>Test content</h1></body></html>
", + response = await client.markdownify( + website_url="https://example.com", headers=headers ) assert response["status"] == "completed" - assert "extracted_info" in response["result"] + assert "# Example Page" in response["result"] @pytest.mark.asyncio -async def test_get_localscraper(mock_api_key, mock_uuid): +async def test_get_markdownify(mock_api_key, mock_uuid): with aioresponses() as mocked: mocked.get( - f"https://api.scrapegraphai.com/v1/localscraper/{mock_uuid}", + f"https://api.scrapegraphai.com/v1/markdownify/{mock_uuid}", payload={ "request_id": mock_uuid, "status": "completed", - "result": {"extracted_info": "Test content"}, + "result": "# Example Page\n\nThis is markdown content.", }, ) async with AsyncClient(api_key=mock_api_key) as client: - response = await client.get_localscraper(mock_uuid) + response = await client.get_markdownify(mock_uuid) assert response["status"] == "completed" assert response["request_id"] == mock_uuid diff --git a/scrapegraph-py/tests/test_client.py b/scrapegraph-py/tests/test_client.py index 23c1162..9cf15c9 100644 --- a/scrapegraph-py/tests/test_client.py +++ b/scrapegraph-py/tests/test_client.py @@ -18,7 +18,7 @@ def mock_uuid(): @responses.activate -def test_smartscraper(mock_api_key): +def test_smartscraper_with_url(mock_api_key): # Mock the API response responses.add( responses.POST, @@ -37,6 +37,54 @@ def test_smartscraper(mock_api_key): assert response["status"] == "completed" +@responses.activate +def test_smartscraper_with_html(mock_api_key): + # Mock the API response + responses.add( + responses.POST, + "https://api.scrapegraphai.com/v1/smartscraper", + json={ + "request_id": str(uuid4()), + "status": "completed", + "result": {"description": "Test content."}, + }, + ) + + with Client(api_key=mock_api_key) as client: + response = client.smartscraper( + website_html="
<html><body><h1>Test content</h1></body></html>
", + user_prompt="Extract info", + ) + assert response["status"] == "completed" + + +@responses.activate +def test_smartscraper_with_headers(mock_api_key): + # Mock the API response + responses.add( + responses.POST, + "https://api.scrapegraphai.com/v1/smartscraper", + json={ + "request_id": str(uuid4()), + "status": "completed", + "result": {"description": "Example domain."}, + }, + ) + + headers = { + "User-Agent": "Mozilla/5.0", + "Cookie": "session=123", + } + + with Client(api_key=mock_api_key) as client: + response = client.smartscraper( + website_url="https://example.com", + user_prompt="Describe this page.", + headers=headers, + ) + assert response["status"] == "completed" + + @responses.activate def test_get_smartscraper(mock_api_key, mock_uuid): responses.add( @@ -118,57 +166,43 @@ def test_markdownify(mock_api_key): @responses.activate -def test_get_markdownify(mock_api_key, mock_uuid): - responses.add( - responses.GET, - f"https://api.scrapegraphai.com/v1/markdownify/{mock_uuid}", - json={ - "request_id": mock_uuid, - "status": "completed", - "result": "# Example Page\n\nThis is markdown content.", - }, - ) - - with Client(api_key=mock_api_key) as client: - response = client.get_markdownify(mock_uuid) - assert response["status"] == "completed" - assert response["request_id"] == mock_uuid - - -@responses.activate -def test_localscraper(mock_api_key): +def test_markdownify_with_headers(mock_api_key): responses.add( responses.POST, - "https://api.scrapegraphai.com/v1/localscraper", + "https://api.scrapegraphai.com/v1/markdownify", json={ "request_id": str(uuid4()), "status": "completed", - "result": {"extracted_info": "Test content"}, + "result": "# Example Page\n\nThis is markdown content.", }, ) + headers = { + "User-Agent": "Mozilla/5.0", + "Cookie": "session=123", + } + with Client(api_key=mock_api_key) as client: - response = client.localscraper( - user_prompt="Extract info", - website_html="
<html><body><h1>Test content</h1></body></html>
", + response = client.markdownify( + website_url="https://example.com", headers=headers ) assert response["status"] == "completed" - assert "extracted_info" in response["result"] + assert "# Example Page" in response["result"] @responses.activate -def test_get_localscraper(mock_api_key, mock_uuid): +def test_get_markdownify(mock_api_key, mock_uuid): responses.add( responses.GET, - f"https://api.scrapegraphai.com/v1/localscraper/{mock_uuid}", + f"https://api.scrapegraphai.com/v1/markdownify/{mock_uuid}", json={ "request_id": mock_uuid, "status": "completed", - "result": {"extracted_info": "Test content"}, + "result": "# Example Page\n\nThis is markdown content.", }, ) with Client(api_key=mock_api_key) as client: - response = client.get_localscraper(mock_uuid) + response = client.get_markdownify(mock_uuid) assert response["status"] == "completed" assert response["request_id"] == mock_uuid diff --git a/scrapegraph-py/tests/test_models.py b/scrapegraph-py/tests/test_models.py index 70841aa..722de22 100644 --- a/scrapegraph-py/tests/test_models.py +++ b/scrapegraph-py/tests/test_models.py @@ -2,10 +2,6 @@ from pydantic import BaseModel, ValidationError from scrapegraph_py.models.feedback import FeedbackRequest -from scrapegraph_py.models.localscraper import ( - GetLocalScraperRequest, - LocalScraperRequest, -) from scrapegraph_py.models.markdownify import GetMarkdownifyRequest, MarkdownifyRequest from scrapegraph_py.models.smartscraper import ( GetSmartScraperRequest, @@ -14,17 +10,40 @@ def test_smartscraper_request_validation(): - class ExampleSchema(BaseModel): name: str age: int - # Valid input + # Valid input with website_url request = SmartScraperRequest( website_url="https://example.com", user_prompt="Describe this page." ) assert request.website_url == "https://example.com" assert request.user_prompt == "Describe this page." + assert request.website_html is None + assert request.headers is None + + # Valid input with website_html + request = SmartScraperRequest( + website_html="
<html><body><h1>Test content</h1></body></html>
", + user_prompt="Extract info", + ) + assert request.website_url is None + assert request.website_html == "
<html><body><h1>Test content</h1></body></html>
" + assert request.user_prompt == "Extract info" + assert request.headers is None + + # Valid input with headers + headers = { + "User-Agent": "Mozilla/5.0", + "Cookie": "session=123", + } + request = SmartScraperRequest( + website_url="https://example.com", + user_prompt="Describe this page.", + headers=headers, + ) + assert request.headers == headers # Test with output_schema request = SmartScraperRequest( @@ -50,6 +69,25 @@ class ExampleSchema(BaseModel): with pytest.raises(ValidationError): SmartScraperRequest(website_url="https://example.com", user_prompt="") + # Invalid HTML + with pytest.raises(ValidationError): + SmartScraperRequest( + website_html="not valid html", + user_prompt="Extract info", + ) + + # HTML too large (>2MB) + large_html = "x" * (2 * 1024 * 1024 + 1) + with pytest.raises(ValidationError): + SmartScraperRequest( + website_html=large_html, + user_prompt="Extract info", + ) + + # Neither URL nor HTML provided + with pytest.raises(ValidationError): + SmartScraperRequest(user_prompt="Extract info") + def test_get_smartscraper_request_validation(): # Valid UUID @@ -88,9 +126,19 @@ def test_feedback_request_validation(): def test_markdownify_request_validation(): - # Valid input + # Valid input without headers request = MarkdownifyRequest(website_url="https://example.com") assert request.website_url == "https://example.com" + assert request.headers is None + + # Valid input with headers + headers = { + "User-Agent": "Mozilla/5.0", + "Cookie": "session=123", + } + request = MarkdownifyRequest(website_url="https://example.com", headers=headers) + assert request.website_url == "https://example.com" + assert request.headers == headers # Invalid URL with pytest.raises(ValidationError): @@ -109,38 +157,3 @@ def test_get_markdownify_request_validation(): # Invalid UUID with pytest.raises(ValidationError): GetMarkdownifyRequest(request_id="invalid-uuid") - - -def test_localscraper_request_validation(): - # Valid input - request = LocalScraperRequest( - user_prompt="Extract info", - website_html="
<html><body><h1>Test content</h1></body></html>
", - ) - assert request.user_prompt == "Extract info" - assert "
<html><body><h1>Test content</h1></body></html>
" in request.website_html - - # Empty prompt - with pytest.raises(ValidationError): - LocalScraperRequest( - user_prompt="", website_html="
<html><body><h1>Test content</h1></body></html>
" - ) - - # Invalid HTML - with pytest.raises(ValidationError): - LocalScraperRequest(user_prompt="Extract info", website_html="not valid html") - - # HTML too large (>2MB) - large_html = "x" * (2 * 1024 * 1024 + 1) - with pytest.raises(ValidationError): - LocalScraperRequest(user_prompt="Extract info", website_html=large_html) - - -def test_get_localscraper_request_validation(): - # Valid UUID - request = GetLocalScraperRequest(request_id="123e4567-e89b-12d3-a456-426614174000") - assert request.request_id == "123e4567-e89b-12d3-a456-426614174000" - - # Invalid UUID - with pytest.raises(ValidationError): - GetLocalScraperRequest(request_id="invalid-uuid") From 2e04e5a1bbd207a7ceeea594878bdea542a7a856 Mon Sep 17 00:00:00 2001 From: PeriniM Date: Sat, 1 Feb 2025 17:19:18 +0100 Subject: [PATCH 4/7] feat: searchscraper --- .../examples/async_searchscraper_example.py | 46 +++++++ .../async_searchscraper_schema_example.py | 119 ++++++++++++++++++ .../examples/searchscraper_example.py | 26 ++++ .../examples/searchscraper_schema_example.py | 41 ++++++ ...ck_example.py => send_feedback_example.py} | 56 ++++----- scrapegraph-py/pyproject.toml | 4 +- scrapegraph-py/scrapegraph_py/async_client.py | 43 +++++++ scrapegraph-py/scrapegraph_py/client.py | 41 ++++++ .../scrapegraph_py/models/searchscraper.py | 49 ++++++++ scrapegraph-py/tests/test_async_client.py | 74 +++++++++++ scrapegraph-py/tests/test_client.py | 76 +++++++++++ scrapegraph-py/tests/test_models.py | 58 +++++++++ 12 files changed, 603 insertions(+), 30 deletions(-) create mode 100644 scrapegraph-py/examples/async_searchscraper_example.py create mode 100644 scrapegraph-py/examples/async_searchscraper_schema_example.py create mode 100644 scrapegraph-py/examples/searchscraper_example.py create mode 100644 scrapegraph-py/examples/searchscraper_schema_example.py rename scrapegraph-py/examples/{feedback_example.py => send_feedback_example.py} (96%) create mode 100644 scrapegraph-py/scrapegraph_py/models/searchscraper.py diff --git a/scrapegraph-py/examples/async_searchscraper_example.py b/scrapegraph-py/examples/async_searchscraper_example.py new file mode 100644 index 0000000..1aae8f9 --- /dev/null +++ b/scrapegraph-py/examples/async_searchscraper_example.py @@ -0,0 +1,46 @@ +""" +Example of using the async searchscraper functionality to search for information concurrently. 
+""" + +import asyncio + +from scrapegraph_py import AsyncClient +from scrapegraph_py.logger import sgai_logger + +sgai_logger.set_logging(level="INFO") + + +async def main(): + # Initialize async client + sgai_client = AsyncClient(api_key="your-api-key-here") + + # List of search queries + queries = [ + "What is the latest version of Python and what are its main features?", + "What are the key differences between Python 2 and Python 3?", + "What is Python's GIL and how does it work?", + ] + + # Create tasks for concurrent execution + tasks = [sgai_client.searchscraper(user_prompt=query) for query in queries] + + # Execute requests concurrently + responses = await asyncio.gather(*tasks, return_exceptions=True) + + # Process results + for i, response in enumerate(responses): + if isinstance(response, Exception): + print(f"\nError for query {i+1}: {response}") + else: + print(f"\nSearch {i+1}:") + print(f"Query: {queries[i]}") + print(f"Result: {response['result']}") + print("Reference URLs:") + for url in response["reference_urls"]: + print(f"- {url}") + + await sgai_client.close() + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/scrapegraph-py/examples/async_searchscraper_schema_example.py b/scrapegraph-py/examples/async_searchscraper_schema_example.py new file mode 100644 index 0000000..753a3e0 --- /dev/null +++ b/scrapegraph-py/examples/async_searchscraper_schema_example.py @@ -0,0 +1,119 @@ +""" +Example of using the async searchscraper functionality with output schemas for extraction. +""" + +import asyncio +from typing import List + +from pydantic import BaseModel + +from scrapegraph_py import AsyncClient +from scrapegraph_py.logger import sgai_logger + +sgai_logger.set_logging(level="INFO") + + +# Define schemas for extracting structured data +class PythonVersionInfo(BaseModel): + version: str + release_date: str + major_features: List[str] + + +class PythonComparison(BaseModel): + key_differences: List[str] + backward_compatible: bool + migration_difficulty: str + + +class GILInfo(BaseModel): + definition: str + purpose: str + limitations: List[str] + workarounds: List[str] + + +async def main(): + # Initialize async client + sgai_client = AsyncClient(api_key="your-api-key-here") + + # Define search queries with their corresponding schemas + searches = [ + { + "prompt": "What is the latest version of Python? 
Include the release date and main features.", + "schema": PythonVersionInfo, + }, + { + "prompt": "Compare Python 2 and Python 3, including backward compatibility and migration difficulty.", + "schema": PythonComparison, + }, + { + "prompt": "Explain Python's GIL, its purpose, limitations, and possible workarounds.", + "schema": GILInfo, + }, + ] + + # Create tasks for concurrent execution + tasks = [ + sgai_client.searchscraper( + user_prompt=search["prompt"], + output_schema=search["schema"], + ) + for search in searches + ] + + # Execute requests concurrently + responses = await asyncio.gather(*tasks, return_exceptions=True) + + # Process results + for i, response in enumerate(responses): + if isinstance(response, Exception): + print(f"\nError for search {i+1}: {response}") + else: + print(f"\nSearch {i+1}:") + print(f"Query: {searches[i]['prompt']}") + # print(f"Raw Result: {response['result']}") + + try: + # Try to extract structured data using the schema + result = searches[i]["schema"].model_validate(response["result"]) + + # Print extracted structured data + if isinstance(result, PythonVersionInfo): + print("\nExtracted Data:") + print(f"Python Version: {result.version}") + print(f"Release Date: {result.release_date}") + print("Major Features:") + for feature in result.major_features: + print(f"- {feature}") + + elif isinstance(result, PythonComparison): + print("\nExtracted Data:") + print("Key Differences:") + for diff in result.key_differences: + print(f"- {diff}") + print(f"Backward Compatible: {result.backward_compatible}") + print(f"Migration Difficulty: {result.migration_difficulty}") + + elif isinstance(result, GILInfo): + print("\nExtracted Data:") + print(f"Definition: {result.definition}") + print(f"Purpose: {result.purpose}") + print("Limitations:") + for limit in result.limitations: + print(f"- {limit}") + print("Workarounds:") + for workaround in result.workarounds: + print(f"- {workaround}") + except Exception as e: + print(f"\nCould not extract structured data: {e}") + + print("\nReference URLs:") + for url in response["reference_urls"]: + print(f"- {url}") + + await sgai_client.close() + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/scrapegraph-py/examples/searchscraper_example.py b/scrapegraph-py/examples/searchscraper_example.py new file mode 100644 index 0000000..2b1903d --- /dev/null +++ b/scrapegraph-py/examples/searchscraper_example.py @@ -0,0 +1,26 @@ +""" +Example of using the searchscraper functionality to search for information. +""" + +from scrapegraph_py import Client +from scrapegraph_py.logger import sgai_logger + +sgai_logger.set_logging(level="INFO") + +# Initialize the client +client = Client(api_key="your-api-key-here") + +# Send a searchscraper request +response = client.searchscraper( + user_prompt="What is the latest version of Python and what are its main features?" +) + +# Print the results +print("\nResults:") +print(f"Answer: {response['result']}") +print("\nReference URLs:") +for url in response["reference_urls"]: + print(f"- {url}") + +# Close the client +client.close() diff --git a/scrapegraph-py/examples/searchscraper_schema_example.py b/scrapegraph-py/examples/searchscraper_schema_example.py new file mode 100644 index 0000000..8c678b2 --- /dev/null +++ b/scrapegraph-py/examples/searchscraper_schema_example.py @@ -0,0 +1,41 @@ +""" +Example of using the searchscraper functionality with a custom output schema. 
+""" + +from typing import List + +from pydantic import BaseModel + +from scrapegraph_py import Client +from scrapegraph_py.logger import sgai_logger + +sgai_logger.set_logging(level="INFO") + + +# Define a custom schema for the output +class PythonVersionInfo(BaseModel): + version: str + release_date: str + major_features: List[str] + is_latest: bool + + +# Initialize the client +client = Client(api_key="your-api-key-here") + +# Send a searchscraper request with schema +response = client.searchscraper( + user_prompt="What is the latest version of Python? Include the release date and main features.", + output_schema=PythonVersionInfo, +) + +# The result will be structured according to our schema +print(f"Request ID: {response['request_id']}") +print(f"Result: {response['result']}") + +print("\nReference URLs:") +for url in response["reference_urls"]: + print(f"- {url}") + +# Close the client +client.close() diff --git a/scrapegraph-py/examples/feedback_example.py b/scrapegraph-py/examples/send_feedback_example.py similarity index 96% rename from scrapegraph-py/examples/feedback_example.py rename to scrapegraph-py/examples/send_feedback_example.py index dc20ae2..4c397ed 100644 --- a/scrapegraph-py/examples/feedback_example.py +++ b/scrapegraph-py/examples/send_feedback_example.py @@ -1,28 +1,28 @@ -from scrapegraph_py import Client -from scrapegraph_py.logger import sgai_logger - -sgai_logger.set_logging(level="INFO") - -# Initialize the client -sgai_client = Client(api_key="your-api-key-here") - -# Example request_id (replace with an actual request_id from a previous request) -request_id = "your-request-id-here" - -# Check remaining credits -credits = sgai_client.get_credits() -print(f"Credits Info: {credits}") - -# Submit feedback for a previous request -feedback_response = sgai_client.submit_feedback( - request_id=request_id, - rating=5, # Rating from 1-5 - feedback_text="The extraction was accurate and exactly what I needed!", -) -print(f"\nFeedback Response: {feedback_response}") - -# Get previous results using get_smartscraper -previous_result = sgai_client.get_smartscraper(request_id=request_id) -print(f"\nRetrieved Previous Result: {previous_result}") - -sgai_client.close() +from scrapegraph_py import Client +from scrapegraph_py.logger import sgai_logger + +sgai_logger.set_logging(level="INFO") + +# Initialize the client +sgai_client = Client(api_key="your-api-key-here") + +# Example request_id (replace with an actual request_id from a previous request) +request_id = "your-request-id-here" + +# Check remaining credits +credits = sgai_client.get_credits() +print(f"Credits Info: {credits}") + +# Submit feedback for a previous request +feedback_response = sgai_client.submit_feedback( + request_id=request_id, + rating=5, # Rating from 1-5 + feedback_text="The extraction was accurate and exactly what I needed!", +) +print(f"\nFeedback Response: {feedback_response}") + +# Get previous results using get_smartscraper +previous_result = sgai_client.get_smartscraper(request_id=request_id) +print(f"\nRetrieved Previous Result: {previous_result}") + +sgai_client.close() diff --git a/scrapegraph-py/pyproject.toml b/scrapegraph-py/pyproject.toml index da701fa..5bd8033 100644 --- a/scrapegraph-py/pyproject.toml +++ b/scrapegraph-py/pyproject.toml @@ -83,7 +83,7 @@ line-length = 88 [tool.ruff.lint] select = ["F", "E", "W", "C"] -ignore = ["E203", "E501"] # Ignore conflicts with Black +ignore = ["E203", "E501", "C901"] # Ignore conflicts with Black and function complexity [tool.mypy] python_version 
= "3.10" @@ -97,4 +97,4 @@ build-backend = "hatchling.build" [tool.poe.tasks] pylint-local = "pylint scrapegraph_py/**/*.py" -pylint-ci = "pylint --disable=C0114,C0115,C0116 --exit-zero scrapegraph_py/**/*.py" +pylint-ci = "pylint --disable=C0114,C0115,C0116,C901 --exit-zero scrapegraph_py/**/*.py" diff --git a/scrapegraph-py/scrapegraph_py/async_client.py b/scrapegraph-py/scrapegraph_py/async_client.py index dffe02d..99b6212 100644 --- a/scrapegraph-py/scrapegraph_py/async_client.py +++ b/scrapegraph-py/scrapegraph_py/async_client.py @@ -10,6 +10,10 @@ from scrapegraph_py.logger import sgai_logger as logger from scrapegraph_py.models.feedback import FeedbackRequest from scrapegraph_py.models.markdownify import GetMarkdownifyRequest, MarkdownifyRequest +from scrapegraph_py.models.searchscraper import ( + GetSearchScraperRequest, + SearchScraperRequest, +) from scrapegraph_py.models.smartscraper import ( GetSmartScraperRequest, SmartScraperRequest, @@ -241,6 +245,45 @@ async def get_credits(self): ) return result + async def searchscraper( + self, + user_prompt: str, + headers: Optional[dict[str, str]] = None, + output_schema: Optional[BaseModel] = None, + ): + """Send a searchscraper request""" + logger.info("🔍 Starting searchscraper request") + logger.debug(f"📝 Prompt: {user_prompt}") + if headers: + logger.debug("🔧 Using custom headers") + + request = SearchScraperRequest( + user_prompt=user_prompt, + headers=headers, + output_schema=output_schema, + ) + logger.debug("✅ Request validation passed") + + result = await self._make_request( + "POST", f"{API_BASE_URL}/searchscraper", json=request.model_dump() + ) + logger.info("✨ Searchscraper request completed successfully") + return result + + async def get_searchscraper(self, request_id: str): + """Get the result of a previous searchscraper request""" + logger.info(f"🔍 Fetching searchscraper result for request {request_id}") + + # Validate input using Pydantic model + GetSearchScraperRequest(request_id=request_id) + logger.debug("✅ Request ID validation passed") + + result = await self._make_request( + "GET", f"{API_BASE_URL}/searchscraper/{request_id}" + ) + logger.info(f"✨ Successfully retrieved result for request {request_id}") + return result + async def close(self): """Close the session to free up resources""" logger.info("🔒 Closing AsyncClient session") diff --git a/scrapegraph-py/scrapegraph_py/client.py b/scrapegraph-py/scrapegraph_py/client.py index 860e254..1168557 100644 --- a/scrapegraph-py/scrapegraph_py/client.py +++ b/scrapegraph-py/scrapegraph_py/client.py @@ -11,6 +11,10 @@ from scrapegraph_py.logger import sgai_logger as logger from scrapegraph_py.models.feedback import FeedbackRequest from scrapegraph_py.models.markdownify import GetMarkdownifyRequest, MarkdownifyRequest +from scrapegraph_py.models.searchscraper import ( + GetSearchScraperRequest, + SearchScraperRequest, +) from scrapegraph_py.models.smartscraper import ( GetSmartScraperRequest, SmartScraperRequest, @@ -247,6 +251,43 @@ def get_credits(self): ) return result + def searchscraper( + self, + user_prompt: str, + headers: Optional[dict[str, str]] = None, + output_schema: Optional[BaseModel] = None, + ): + """Send a searchscraper request""" + logger.info("🔍 Starting searchscraper request") + logger.debug(f"📝 Prompt: {user_prompt}") + if headers: + logger.debug("🔧 Using custom headers") + + request = SearchScraperRequest( + user_prompt=user_prompt, + headers=headers, + output_schema=output_schema, + ) + logger.debug("✅ Request validation passed") + + result = 
self._make_request( + "POST", f"{API_BASE_URL}/searchscraper", json=request.model_dump() + ) + logger.info("✨ Searchscraper request completed successfully") + return result + + def get_searchscraper(self, request_id: str): + """Get the result of a previous searchscraper request""" + logger.info(f"🔍 Fetching searchscraper result for request {request_id}") + + # Validate input using Pydantic model + GetSearchScraperRequest(request_id=request_id) + logger.debug("✅ Request ID validation passed") + + result = self._make_request("GET", f"{API_BASE_URL}/searchscraper/{request_id}") + logger.info(f"✨ Successfully retrieved result for request {request_id}") + return result + def close(self): """Close the session to free up resources""" logger.info("🔒 Closing Client session") diff --git a/scrapegraph-py/scrapegraph_py/models/searchscraper.py b/scrapegraph-py/scrapegraph_py/models/searchscraper.py new file mode 100644 index 0000000..997d407 --- /dev/null +++ b/scrapegraph-py/scrapegraph_py/models/searchscraper.py @@ -0,0 +1,49 @@ +# Models for searchscraper endpoint + +from typing import Optional, Type +from uuid import UUID + +from pydantic import BaseModel, Field, model_validator + + +class SearchScraperRequest(BaseModel): + user_prompt: str = Field(..., example="What is the latest version of Python?") + headers: Optional[dict[str, str]] = Field( + None, + example={ + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", + "Cookie": "cookie1=value1; cookie2=value2", + }, + description="Optional headers to send with the request, including cookies and user agent", + ) + output_schema: Optional[Type[BaseModel]] = None + + @model_validator(mode="after") + def validate_user_prompt(self) -> "SearchScraperRequest": + if self.user_prompt is None or not self.user_prompt.strip(): + raise ValueError("User prompt cannot be empty") + if not any(c.isalnum() for c in self.user_prompt): + raise ValueError("User prompt must contain a valid prompt") + return self + + def model_dump(self, *args, **kwargs) -> dict: + data = super().model_dump(*args, **kwargs) + # Convert the Pydantic model schema to dict if present + if self.output_schema is not None: + data["output_schema"] = self.output_schema.model_json_schema() + return data + + +class GetSearchScraperRequest(BaseModel): + """Request model for get_searchscraper endpoint""" + + request_id: str = Field(..., example="123e4567-e89b-12d3-a456-426614174000") + + @model_validator(mode="after") + def validate_request_id(self) -> "GetSearchScraperRequest": + try: + # Validate the request_id is a valid UUID + UUID(self.request_id) + except ValueError: + raise ValueError("request_id must be a valid UUID") + return self diff --git a/scrapegraph-py/tests/test_async_client.py b/scrapegraph-py/tests/test_async_client.py index 167cb9a..69c067e 100644 --- a/scrapegraph-py/tests/test_async_client.py +++ b/scrapegraph-py/tests/test_async_client.py @@ -210,3 +210,77 @@ async def test_get_markdownify(mock_api_key, mock_uuid): response = await client.get_markdownify(mock_uuid) assert response["status"] == "completed" assert response["request_id"] == mock_uuid + + +@pytest.mark.asyncio +async def test_searchscraper(mock_api_key): + with aioresponses() as mocked: + mocked.post( + "https://api.scrapegraphai.com/v1/searchscraper", + payload={ + "request_id": str(uuid4()), + "status": "completed", + "result": {"answer": "Python 3.12 is the latest version."}, + "reference_urls": ["https://www.python.org/downloads/"], + }, + ) + + async with 
AsyncClient(api_key=mock_api_key) as client: + response = await client.searchscraper( + user_prompt="What is the latest version of Python?" + ) + assert response["status"] == "completed" + assert "answer" in response["result"] + assert "reference_urls" in response + assert isinstance(response["reference_urls"], list) + + +@pytest.mark.asyncio +async def test_searchscraper_with_headers(mock_api_key): + with aioresponses() as mocked: + mocked.post( + "https://api.scrapegraphai.com/v1/searchscraper", + payload={ + "request_id": str(uuid4()), + "status": "completed", + "result": {"answer": "Python 3.12 is the latest version."}, + "reference_urls": ["https://www.python.org/downloads/"], + }, + ) + + headers = { + "User-Agent": "Mozilla/5.0", + "Cookie": "session=123", + } + + async with AsyncClient(api_key=mock_api_key) as client: + response = await client.searchscraper( + user_prompt="What is the latest version of Python?", + headers=headers, + ) + assert response["status"] == "completed" + assert "answer" in response["result"] + assert "reference_urls" in response + assert isinstance(response["reference_urls"], list) + + +@pytest.mark.asyncio +async def test_get_searchscraper(mock_api_key, mock_uuid): + with aioresponses() as mocked: + mocked.get( + f"https://api.scrapegraphai.com/v1/searchscraper/{mock_uuid}", + payload={ + "request_id": mock_uuid, + "status": "completed", + "result": {"answer": "Python 3.12 is the latest version."}, + "reference_urls": ["https://www.python.org/downloads/"], + }, + ) + + async with AsyncClient(api_key=mock_api_key) as client: + response = await client.get_searchscraper(mock_uuid) + assert response["status"] == "completed" + assert response["request_id"] == mock_uuid + assert "answer" in response["result"] + assert "reference_urls" in response + assert isinstance(response["reference_urls"], list) diff --git a/scrapegraph-py/tests/test_client.py b/scrapegraph-py/tests/test_client.py index 9cf15c9..11ef12f 100644 --- a/scrapegraph-py/tests/test_client.py +++ b/scrapegraph-py/tests/test_client.py @@ -206,3 +206,79 @@ def test_get_markdownify(mock_api_key, mock_uuid): response = client.get_markdownify(mock_uuid) assert response["status"] == "completed" assert response["request_id"] == mock_uuid + + +@responses.activate +def test_searchscraper(mock_api_key): + # Mock the API response + responses.add( + responses.POST, + "https://api.scrapegraphai.com/v1/searchscraper", + json={ + "request_id": str(uuid4()), + "status": "completed", + "result": {"answer": "Python 3.12 is the latest version."}, + "reference_urls": ["https://www.python.org/downloads/"], + }, + ) + + with Client(api_key=mock_api_key) as client: + response = client.searchscraper( + user_prompt="What is the latest version of Python?" 
+ ) + assert response["status"] == "completed" + assert "answer" in response["result"] + assert "reference_urls" in response + assert isinstance(response["reference_urls"], list) + + +@responses.activate +def test_searchscraper_with_headers(mock_api_key): + # Mock the API response + responses.add( + responses.POST, + "https://api.scrapegraphai.com/v1/searchscraper", + json={ + "request_id": str(uuid4()), + "status": "completed", + "result": {"answer": "Python 3.12 is the latest version."}, + "reference_urls": ["https://www.python.org/downloads/"], + }, + ) + + headers = { + "User-Agent": "Mozilla/5.0", + "Cookie": "session=123", + } + + with Client(api_key=mock_api_key) as client: + response = client.searchscraper( + user_prompt="What is the latest version of Python?", + headers=headers, + ) + assert response["status"] == "completed" + assert "answer" in response["result"] + assert "reference_urls" in response + assert isinstance(response["reference_urls"], list) + + +@responses.activate +def test_get_searchscraper(mock_api_key, mock_uuid): + responses.add( + responses.GET, + f"https://api.scrapegraphai.com/v1/searchscraper/{mock_uuid}", + json={ + "request_id": mock_uuid, + "status": "completed", + "result": {"answer": "Python 3.12 is the latest version."}, + "reference_urls": ["https://www.python.org/downloads/"], + }, + ) + + with Client(api_key=mock_api_key) as client: + response = client.get_searchscraper(mock_uuid) + assert response["status"] == "completed" + assert response["request_id"] == mock_uuid + assert "answer" in response["result"] + assert "reference_urls" in response + assert isinstance(response["reference_urls"], list) diff --git a/scrapegraph-py/tests/test_models.py b/scrapegraph-py/tests/test_models.py index 722de22..50c788f 100644 --- a/scrapegraph-py/tests/test_models.py +++ b/scrapegraph-py/tests/test_models.py @@ -3,6 +3,10 @@ from scrapegraph_py.models.feedback import FeedbackRequest from scrapegraph_py.models.markdownify import GetMarkdownifyRequest, MarkdownifyRequest +from scrapegraph_py.models.searchscraper import ( + GetSearchScraperRequest, + SearchScraperRequest, +) from scrapegraph_py.models.smartscraper import ( GetSmartScraperRequest, SmartScraperRequest, @@ -157,3 +161,57 @@ def test_get_markdownify_request_validation(): # Invalid UUID with pytest.raises(ValidationError): GetMarkdownifyRequest(request_id="invalid-uuid") + + +def test_searchscraper_request_validation(): + class ExampleSchema(BaseModel): + name: str + age: int + + # Valid input without headers + request = SearchScraperRequest(user_prompt="What is the latest version of Python?") + assert request.user_prompt == "What is the latest version of Python?" 
+ assert request.headers is None + assert request.output_schema is None + + # Valid input with headers + headers = { + "User-Agent": "Mozilla/5.0", + "Cookie": "session=123", + } + request = SearchScraperRequest( + user_prompt="What is the latest version of Python?", + headers=headers, + ) + assert request.headers == headers + + # Test with output_schema + request = SearchScraperRequest( + user_prompt="What is the latest version of Python?", + output_schema=ExampleSchema, + ) + + # When we dump the model, the output_schema should be converted to a dict + dumped = request.model_dump() + assert isinstance(dumped["output_schema"], dict) + assert "properties" in dumped["output_schema"] + assert "name" in dumped["output_schema"]["properties"] + assert "age" in dumped["output_schema"]["properties"] + + # Empty prompt + with pytest.raises(ValidationError): + SearchScraperRequest(user_prompt="") + + # Invalid prompt (no alphanumeric characters) + with pytest.raises(ValidationError): + SearchScraperRequest(user_prompt="!@#$%^") + + +def test_get_searchscraper_request_validation(): + # Valid UUID + request = GetSearchScraperRequest(request_id="123e4567-e89b-12d3-a456-426614174000") + assert request.request_id == "123e4567-e89b-12d3-a456-426614174000" + + # Invalid UUID + with pytest.raises(ValidationError): + GetSearchScraperRequest(request_id="invalid-uuid") From 8e008465f7280c53e2faab7a92f02871ffc5b867 Mon Sep 17 00:00:00 2001 From: PeriniM Date: Mon, 3 Feb 2025 16:30:20 +0100 Subject: [PATCH 5/7] chore: refactor examples --- scrapegraph-py/examples/{ => async}/async_markdownify_example.py | 0 .../examples/{ => async}/async_searchscraper_example.py | 0 .../examples/{ => async}/async_searchscraper_schema_example.py | 0 scrapegraph-py/examples/{ => async}/async_smartscraper_example.py | 0 .../examples/{ => async}/async_smartscraper_schema_example.py | 0 .../examples/{ => miscellaneous}/get_credits_example.py | 0 .../examples/{ => miscellaneous}/optional_headers_example.py | 0 .../examples/{ => miscellaneous}/send_feedback_example.py | 0 scrapegraph-py/examples/{ => sync}/markdownify_example.py | 0 scrapegraph-py/examples/{ => sync}/searchscraper_example.py | 0 .../examples/{ => sync}/searchscraper_schema_example.py | 0 scrapegraph-py/examples/{ => sync}/smartscraper_example.py | 0 scrapegraph-py/examples/{ => sync}/smartscraper_schema_example.py | 0 13 files changed, 0 insertions(+), 0 deletions(-) rename scrapegraph-py/examples/{ => async}/async_markdownify_example.py (100%) rename scrapegraph-py/examples/{ => async}/async_searchscraper_example.py (100%) rename scrapegraph-py/examples/{ => async}/async_searchscraper_schema_example.py (100%) rename scrapegraph-py/examples/{ => async}/async_smartscraper_example.py (100%) rename scrapegraph-py/examples/{ => async}/async_smartscraper_schema_example.py (100%) rename scrapegraph-py/examples/{ => miscellaneous}/get_credits_example.py (100%) rename scrapegraph-py/examples/{ => miscellaneous}/optional_headers_example.py (100%) rename scrapegraph-py/examples/{ => miscellaneous}/send_feedback_example.py (100%) rename scrapegraph-py/examples/{ => sync}/markdownify_example.py (100%) rename scrapegraph-py/examples/{ => sync}/searchscraper_example.py (100%) rename scrapegraph-py/examples/{ => sync}/searchscraper_schema_example.py (100%) rename scrapegraph-py/examples/{ => sync}/smartscraper_example.py (100%) rename scrapegraph-py/examples/{ => sync}/smartscraper_schema_example.py (100%) diff --git a/scrapegraph-py/examples/async_markdownify_example.py 
b/scrapegraph-py/examples/async/async_markdownify_example.py similarity index 100% rename from scrapegraph-py/examples/async_markdownify_example.py rename to scrapegraph-py/examples/async/async_markdownify_example.py diff --git a/scrapegraph-py/examples/async_searchscraper_example.py b/scrapegraph-py/examples/async/async_searchscraper_example.py similarity index 100% rename from scrapegraph-py/examples/async_searchscraper_example.py rename to scrapegraph-py/examples/async/async_searchscraper_example.py diff --git a/scrapegraph-py/examples/async_searchscraper_schema_example.py b/scrapegraph-py/examples/async/async_searchscraper_schema_example.py similarity index 100% rename from scrapegraph-py/examples/async_searchscraper_schema_example.py rename to scrapegraph-py/examples/async/async_searchscraper_schema_example.py diff --git a/scrapegraph-py/examples/async_smartscraper_example.py b/scrapegraph-py/examples/async/async_smartscraper_example.py similarity index 100% rename from scrapegraph-py/examples/async_smartscraper_example.py rename to scrapegraph-py/examples/async/async_smartscraper_example.py diff --git a/scrapegraph-py/examples/async_smartscraper_schema_example.py b/scrapegraph-py/examples/async/async_smartscraper_schema_example.py similarity index 100% rename from scrapegraph-py/examples/async_smartscraper_schema_example.py rename to scrapegraph-py/examples/async/async_smartscraper_schema_example.py diff --git a/scrapegraph-py/examples/get_credits_example.py b/scrapegraph-py/examples/miscellaneous/get_credits_example.py similarity index 100% rename from scrapegraph-py/examples/get_credits_example.py rename to scrapegraph-py/examples/miscellaneous/get_credits_example.py diff --git a/scrapegraph-py/examples/optional_headers_example.py b/scrapegraph-py/examples/miscellaneous/optional_headers_example.py similarity index 100% rename from scrapegraph-py/examples/optional_headers_example.py rename to scrapegraph-py/examples/miscellaneous/optional_headers_example.py diff --git a/scrapegraph-py/examples/send_feedback_example.py b/scrapegraph-py/examples/miscellaneous/send_feedback_example.py similarity index 100% rename from scrapegraph-py/examples/send_feedback_example.py rename to scrapegraph-py/examples/miscellaneous/send_feedback_example.py diff --git a/scrapegraph-py/examples/markdownify_example.py b/scrapegraph-py/examples/sync/markdownify_example.py similarity index 100% rename from scrapegraph-py/examples/markdownify_example.py rename to scrapegraph-py/examples/sync/markdownify_example.py diff --git a/scrapegraph-py/examples/searchscraper_example.py b/scrapegraph-py/examples/sync/searchscraper_example.py similarity index 100% rename from scrapegraph-py/examples/searchscraper_example.py rename to scrapegraph-py/examples/sync/searchscraper_example.py diff --git a/scrapegraph-py/examples/searchscraper_schema_example.py b/scrapegraph-py/examples/sync/searchscraper_schema_example.py similarity index 100% rename from scrapegraph-py/examples/searchscraper_schema_example.py rename to scrapegraph-py/examples/sync/searchscraper_schema_example.py diff --git a/scrapegraph-py/examples/smartscraper_example.py b/scrapegraph-py/examples/sync/smartscraper_example.py similarity index 100% rename from scrapegraph-py/examples/smartscraper_example.py rename to scrapegraph-py/examples/sync/smartscraper_example.py diff --git a/scrapegraph-py/examples/smartscraper_schema_example.py b/scrapegraph-py/examples/sync/smartscraper_schema_example.py similarity index 100% rename from 
scrapegraph-py/examples/smartscraper_schema_example.py rename to scrapegraph-py/examples/sync/smartscraper_schema_example.py From bfdbea038918d79df2e3e9442e25d5f08bbccbbc Mon Sep 17 00:00:00 2001 From: PeriniM Date: Mon, 3 Feb 2025 17:32:40 +0100 Subject: [PATCH 6/7] feat: updated readmes --- README.md | 21 ++++++----- scrapegraph-py/README.md | 79 ++++++++++++++++++++++++++-------------- 2 files changed, 63 insertions(+), 37 deletions(-) diff --git a/README.md b/README.md index e4f9bad..3d15162 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ ScrapeGraph API Banner

-Official SDKs for the ScrapeGraph AI API - Intelligent web scraping powered by AI. Extract structured data from any webpage with natural language prompts. +Official SDKs for the ScrapeGraph AI API - Intelligent web scraping and search powered by AI. Extract structured data from any webpage or perform AI-powered web searches with natural language prompts. Get your [API key](https://scrapegraphai.com)! @@ -17,7 +17,7 @@ Get your [API key](https://scrapegraphai.com)! - [Python SDK Documentation](scrapegraph-py/README.md) - [JavaScript SDK Documentation](scrapegraph-js/README.md) -- [API Documentation](https://docs.scrapegraphai.com) +- [API Documentation](https://docs.scrapegraphai.com) - [Website](https://scrapegraphai.com) ## 📦 Installation @@ -34,7 +34,7 @@ npm install scrapegraph-js ## 🎯 Core Features -- 🤖 **AI-Powered Extraction**: Use natural language to describe what data you want +- 🤖 **AI-Powered Extraction & Search**: Use natural language to extract data or search the web - 📊 **Structured Output**: Get clean, structured data with optional schema validation - 🔄 **Multiple Formats**: Extract data as JSON, Markdown, or custom schemas - ⚡ **High Performance**: Concurrent processing and automatic retries @@ -43,22 +43,22 @@ npm install scrapegraph-js ## 🛠️ Available Endpoints ### 🔍 SmartScraper -Extract structured data from any webpage using natural language prompts. +Using AI to extract structured data from any webpage or HTML content with natural language prompts. + +### 🔎 SearchScraper +Perform AI-powered web searches with structured results and reference URLs. ### 📝 Markdownify Convert any webpage into clean, formatted markdown. -### 💻 LocalScraper -Extract information from a local HTML file using AI. - - ## 🌟 Key Benefits - 📝 **Natural Language Queries**: No complex selectors or XPath needed - 🎯 **Precise Extraction**: AI understands context and structure -- 🔄 **Adaptive Scraping**: Works with dynamic and static content +- 🔄 **Adaptive Processing**: Works with both web content and direct HTML - 📊 **Schema Validation**: Ensure data consistency with Pydantic/TypeScript - ⚡ **Async Support**: Handle multiple requests efficiently +- 🔍 **Source Attribution**: Get reference URLs for search results ## 💡 Use Cases @@ -67,13 +67,14 @@ Extract information from a local HTML file using AI. 
- 📰 **Content Aggregation**: Convert articles to structured formats - 🔍 **Data Mining**: Extract specific information from multiple sources - 📱 **App Integration**: Feed clean data into your applications +- 🌐 **Web Research**: Perform AI-powered searches with structured results ## 📖 Documentation For detailed documentation and examples, visit: - [Python SDK Guide](scrapegraph-py/README.md) - [JavaScript SDK Guide](scrapegraph-js/README.md) -- [API Documentation](https://docs.scrapegraphai.com) +- [API Documentation](https://docs.scrapegraphai.com) ## 💬 Support & Feedback diff --git a/scrapegraph-py/README.md b/scrapegraph-py/README.md index e530452..7f5d638 100644 --- a/scrapegraph-py/README.md +++ b/scrapegraph-py/README.md @@ -4,7 +4,7 @@ [![Python Support](https://img.shields.io/pypi/pyversions/scrapegraph-py.svg)](https://pypi.org/project/scrapegraph-py/) [![License](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) -[![Documentation Status](https://readthedocs.org/projects/scrapegraph-py/badge/?version=latest)](https://docs.scrapegraphai.com) +[![Documentation Status](https://readthedocs.org/projects/scrapegraph-py/badge/?version=latest)](https://docs.scrapegraphai.com)

ScrapeGraph API Banner @@ -20,7 +20,7 @@ pip install scrapegraph-py ## 🚀 Features -- 🤖 AI-powered web scraping +- 🤖 AI-powered web scraping and search - 🔄 Both sync and async clients - 📊 Structured output with Pydantic schemas - 🔍 Detailed logging @@ -42,19 +42,34 @@ client = Client(api_key="your-api-key-here") ### 🔍 SmartScraper -Scrapes any webpage using AI to extract specific information. +Extract structured data from any webpage or HTML content using AI. ```python from scrapegraph_py import Client client = Client(api_key="your-api-key-here") -# Basic usage +# Using a URL response = client.smartscraper( website_url="https://example.com", user_prompt="Extract the main heading and description" ) +# Or using HTML content +html_content = """ + + +
+<html>
+    <body>
+        <h1>Company Name</h1>
+        <p>We are a technology company focused on AI solutions.</p>
+    </body>
+</html>
+ + +""" + +response = client.smartscraper( + website_html=html_content, + user_prompt="Extract the company description" +) + print(response) ``` @@ -80,46 +95,56 @@ response = client.smartscraper( -### 📝 Markdownify +### 🔎 SearchScraper -Converts any webpage into clean, formatted markdown. +Perform AI-powered web searches with structured results and reference URLs. ```python from scrapegraph_py import Client client = Client(api_key="your-api-key-here") -response = client.markdownify( - website_url="https://example.com" +response = client.searchscraper( + user_prompt="What is the latest version of Python and its main features?" ) -print(response) +print(f"Answer: {response['result']}") +print(f"Sources: {response['reference_urls']}") ``` -### 💻 LocalScraper - -Extracts information from HTML content using AI. +
+<details>
+<summary>Output Schema (Optional)</summary>

```python
+from pydantic import BaseModel, Field
from scrapegraph_py import Client

client = Client(api_key="your-api-key-here")

-html_content = """
-<html>
-    <body>
-        <h1>Company Name</h1>
-        <p>We are a technology company focused on AI solutions.</p>
-        <div class="contact">
-            <p>Email: contact@example.com</p>
-        </div>
-    </body>
-</html>
-"""
+class PythonVersionInfo(BaseModel):
+    version: str = Field(description="The latest Python version number")
+    release_date: str = Field(description="When this version was released")
+    major_features: list[str] = Field(description="List of main features")
+
+response = client.searchscraper(
+    user_prompt="What is the latest version of Python and its main features?",
+    output_schema=PythonVersionInfo
+)
+```
+
+</details>
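+
+The raw response can then be validated back into the schema for type-safe access. A minimal sketch, assuming the returned `result` payload matches the schema fields:
+
+```python
+# Validate the raw dict into the Pydantic model (assumes matching keys)
+info = PythonVersionInfo.model_validate(response["result"])
+
+print(f"Latest version: {info.version} (released {info.release_date})")
+for feature in info.major_features:
+    print(f"- {feature}")
+```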
-response = client.localscraper( - user_prompt="Extract the company description", - website_html=html_content +### 📝 Markdownify + +Converts any webpage into clean, formatted markdown. + +```python +from scrapegraph_py import Client + +client = Client(api_key="your-api-key-here") + +response = client.markdownify( + website_url="https://example.com" ) print(response) @@ -177,7 +202,7 @@ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file ## 🔗 Links - [Website](https://scrapegraphai.com) -- [Documentation](https://docs.scrapegraphai.com) +- [Documentation](https://docs.scrapegraphai.com) - [GitHub](https://github.com/ScrapeGraphAI/scrapegraph-sdk) --- From bcb9b0b731b057d242fdf80b43d96879ff7a2764 Mon Sep 17 00:00:00 2001 From: PeriniM Date: Mon, 3 Feb 2025 17:37:27 +0100 Subject: [PATCH 7/7] feat: modified icons --- README.md | 4 ++-- scrapegraph-py/README.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 3d15162..e42adbd 100644 --- a/README.md +++ b/README.md @@ -42,10 +42,10 @@ npm install scrapegraph-js ## 🛠️ Available Endpoints -### 🔍 SmartScraper +### 🤖 SmartScraper Using AI to extract structured data from any webpage or HTML content with natural language prompts. -### 🔎 SearchScraper +### 🔍 SearchScraper Perform AI-powered web searches with structured results and reference URLs. ### 📝 Markdownify diff --git a/scrapegraph-py/README.md b/scrapegraph-py/README.md index 7f5d638..bb4c3ce 100644 --- a/scrapegraph-py/README.md +++ b/scrapegraph-py/README.md @@ -40,7 +40,7 @@ client = Client(api_key="your-api-key-here") ## 📚 Available Endpoints -### 🔍 SmartScraper +### 🤖 SmartScraper Extract structured data from any webpage or HTML content using AI. @@ -95,7 +95,7 @@ response = client.smartscraper( -### 🔎 SearchScraper +### 🔍 SearchScraper Perform AI-powered web searches with structured results and reference URLs.
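+
+For long-running searches you can poll by request ID instead of blocking. A minimal sketch, assuming the service can report a pending status before `"completed"` (only `"completed"` appears in the test suite):
+
+```python
+import time
+
+from scrapegraph_py import Client
+
+with Client(api_key="your-api-key-here") as client:
+    response = client.searchscraper(
+        user_prompt="What is the latest version of Python?"
+    )
+
+    # Poll until the service reports completion
+    while response.get("status") != "completed":
+        time.sleep(2)  # fixed backoff between polls; tune as needed
+        response = client.get_searchscraper(response["request_id"])
+
+    print(f"Answer: {response['result']}")
+    print(f"Sources: {response['reference_urls']}")
+```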