Skip to main content

TypeScript: Save Successful Pages

import { LLMLayerClient } from 'llmlayer';
import { mkdir, writeFile } from 'node:fs/promises';
import { createHash } from 'node:crypto';

// Directory that receives one markdown file per successfully crawled page.
const OUTPUT_DIR = 'crawl-output';

const client = new LLMLayerClient({
  apiKey: process.env.LLMLAYER_API_KEY,
});

// `recursive: true` makes this a no-op when the directory already exists.
await mkdir(OUTPUT_DIR, { recursive: true });

// Consume crawl events as they stream in and persist every successful page.
for await (const event of client.crawlStream({
  url: 'https://www.ycombinator.com',
  maxPages: 25,
  maxDepth: 2,
  mainContentOnly: true,
})) {
  if (event.type === 'page') {
    const page = event.page;
    // A page may lack a final URL; fall back to the requested URL so both the
    // log line and the file name are derived from the same identifier.
    const pageUrl = page.final_url || page.requested_url;

    if (!page.success || !page.markdown) {
      console.warn('Skipped:', pageUrl, page.error);
      continue;
    }

    // Hash the URL to get a filesystem-safe, deterministic file name.
    const id = createHash('sha1').update(pageUrl || '').digest('hex');
    await writeFile(`${OUTPUT_DIR}/${id}.md`, page.markdown);
  }

  if (event.type === 'usage') {
    console.log(`Billed pages: ${event.billed_count}, cost: $${event.cost}`);
  }

  if (event.type === 'error') {
    // Log and keep consuming the stream.
    console.error(event.error);
  }
}

Python: Stream Into Memory

from llmlayer import LLMLayerClient

client = LLMLayerClient(api_key="YOUR_LLMLAYER_API_KEY")

# Successfully crawled pages, kept in memory in the order they are streamed.
pages = []

for event in client.crawl_stream(
    "https://www.ycombinator.com",
    max_pages=25,
    max_depth=2,
    main_content_only=True,
):
    event_type = event.get("type")

    if event_type == "page":
        page = event["page"]
        # Keep only pages that both succeeded and carry markdown content.
        if page.get("success") and page.get("markdown"):
            pages.append(page)
        else:
            # A skipped page may have no final URL; fall back to the requested
            # one so the log line still identifies which page was skipped.
            # NOTE(review): assumes page dicts expose "requested_url" — confirm.
            page_url = page.get("final_url") or page.get("requested_url")
            print("Skipped:", page_url, page.get("error"))

    elif event_type == "usage":
        print("Billed pages:", event.get("billed_count"))
        print("Cost:", event.get("cost"))

    elif event_type == "error":
        # Report the error and keep consuming the stream.
        print("Crawl error:", event.get("error"))

print("Saved pages:", len(pages))

Python: Async Crawl

import asyncio
from llmlayer import LLMLayerClient


async def main():
    """Stream an async crawl and print each page's final URL with a has-markdown flag."""
    crawler = LLMLayerClient(api_key="YOUR_LLMLAYER_API_KEY")
    stream = crawler.crawl_stream_async(
        "https://www.ycombinator.com",
        max_pages=10,
        max_depth=1,
        main_content_only=True,
    )

    async for event in stream:
        # Only page events are reported here; every other event type is ignored.
        if event.get("type") != "page":
            continue

        page = event["page"]
        final_url = page.get("final_url")
        has_markdown = bool(page.get("markdown"))
        print(final_url, has_markdown)


asyncio.run(main())

Notes

  • Crawl currently returns markdown page content only.
  • Use Map first if you need to inspect or filter URLs before fetching content.
  • Use the usage event as the request-level cost summary: it reports the billed page count (billed_count) and the total cost (cost).