Recursively crawl websites and extract structured content
# Python SDK (needed for the `from scrapester import ScrapesterApp` examples)
pip install scrapester

# Node.js SDK
npm install scrapester
from scrapester import ScrapesterApp

# Create the client with your API key.
app = ScrapesterApp(api_key="sk-YOUR_API_KEY")

# Options forwarded to the crawl request.
crawl_options = {
    "limit": 100,
    "scrapeOptions": {"formats": ["markdown", "html"]},
}

# Crawl a website and print whatever crawl_url returns.
result = app.crawl_url(
    "https://example.com",
    params=crawl_options,
    poll_interval=30,
)
print(result)
{ "success": true, "id": "crawl-123-456-789", "url": "https://api.scrapester.lol/v1/crawl/crawl-123-456-789" }
# Look up a crawl by its ID (replace the placeholder) and print the response.
crawl_state = app.check_crawl_status("<crawl_id>")
print(crawl_state)
{ "status": "scraping", "total": 36, "completed": 10, "creditsUsed": 10, "expiresAt": "2024-01-01T00:00:00.000Z", "next": "https://api.scrapester.lol/v1/crawl/123-456-789?skip=10", "data": [ { "markdown": "# Example Page\nContent here...", "html": "<!DOCTYPE html><html>...</html>", "metadata": { "title": "Example Page", "language": "en", "sourceURL": "https://example.com/page-1", "description": "Page description", "statusCode": 200 } } ] }
import asyncio

from scrapester import ScrapesterApp

app = ScrapesterApp(api_key="sk-YOUR_API_KEY")


def on_document(detail):
    """Print the payload received with a "document" event."""
    print("Document:", detail)


def on_error(detail):
    """Print the ``error`` entry of the payload received with an "error" event."""
    message = detail["error"]
    print("Error:", message)


def on_done(detail):
    """Print the ``status`` entry of the payload received with a "done" event."""
    final_status = detail["status"]
    print("Completed:", final_status)


async def start_crawl():
    """Create a crawl watcher, attach the three handlers, and await its connection."""
    crawl_options = {"excludePaths": ["blog/*"], "limit": 5}
    crawl_watcher = app.crawl_url_and_watch("example.com", crawl_options)

    # Registered in the same order as before: document, error, done.
    handlers = {
        "document": on_document,
        "error": on_error,
        "done": on_done,
    }
    for event_name, handler in handlers.items():
        crawl_watcher.add_event_listener(event_name, handler)

    await crawl_watcher.connect()


# Run the crawl
asyncio.run(start_crawl())
# Send the crawl request over HTTP; the JSON body includes a "webhook" URL.
curl --request POST 'https://api.scrapester.lol/v1/crawl' \
  --header 'Content-Type: application/json' \
  --header 'Authorization: Bearer sk-YOUR_API_KEY' \
  --data '{ "url": "https://example.com", "limit": 100, "webhook": "https://your-domain.com/webhook" }'
crawl.started
crawl.page
crawl.completed
crawl.failed
{ "success": true, "type": "crawl.page", "id": "crawl-123-456-789", "data": [{ "markdown": "# Page Content...", "html": "<!DOCTYPE html>...", "metadata": { "title": "Page Title", "language": "en", "sourceURL": "https://example.com/page-1", "description": "Page description", "statusCode": 200 } }] }