POST /api/v2/crawl_stream

Crawl API - Multi-page website crawler with streaming
curl --request POST \
  --url https://api.llmlayer.dev/api/v2/crawl_stream \
  --header 'Authorization: Bearer <token>' \
  --header 'Content-Type: application/json' \
  --data '
{
  "url": "https://example.com",
  "max_pages": 25,
  "max_depth": 2,
  "timeout": 60,
  "include_subdomains": false,
  "include_links": true,
  "include_images": true,
  "advanced_proxy": false,
  "main_content_only": false,
  "formats": [
    "markdown",
    "html"
  ]
}
'
"data: {\"type\":\"page\",\"page\":{\"requested_url\":\"https://example.com/page1\",\"final_url\":\"https://example.com/page1\",\"title\":\"Page Title\",\"hash_sha256\":\"abc123...\",\"markdown\":\"# Content...\",\"success\":true}}\n\n"
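Each event in the response arrives as an SSE `data:` line whose payload is a JSON object. A minimal parsing sketch (Python standard library only), applied to the sample event above:

```python
import json

def parse_sse_event(raw: str) -> dict:
    """Parse a single SSE 'data:' line into a JSON event dict."""
    line = raw.strip()
    if not line.startswith("data:"):
        raise ValueError("not an SSE data line")
    return json.loads(line[len("data:"):].strip())

# The sample "page" event from the response above
sample = ('data: {"type":"page","page":{"requested_url":"https://example.com/page1",'
          '"final_url":"https://example.com/page1","title":"Page Title",'
          '"hash_sha256":"abc123...","markdown":"# Content...","success":true}}')

event = parse_sse_event(sample)
print(event["type"], event["page"]["title"])  # page Page Title
```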

Authorizations

Authorization
string
header
required

Bearer token authentication using your LLMLayer API key. Include in Authorization header as: Bearer YOUR_LLMLAYER_API_KEY

Body

application/json
url
string<uri>
required

Seed URL to start crawling from

Example:

"https://example.com"

max_pages
integer
default:25

Maximum number of pages to crawl (hard limit: 100)

Required range: 1 <= x <= 100
max_depth
integer
default:2

Maximum depth to crawl from seed URL

Required range: x >= 1
timeout
number | null
default:60

Total timeout in seconds for the entire crawl operation

include_subdomains
boolean
default:false

If true, includes pages from subdomains

include_links
boolean

Include hyperlinks in extracted content

include_images
boolean
default:true

Include images in extracted content

advanced_proxy
boolean | null
default:false

Enable advanced proxy for protected sites.

main_content_only
boolean | null
default:false

Extract only main page content.

formats
enum<string>[]

Output format preference for crawled pages. Currently markdown is returned.

Available options:
markdown,
html,
screenshot,
pdf
Example:
["markdown", "html"]
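The body parameters above can be assembled and validated client-side before sending. A sketch assuming only the constraints documented here (`max_pages` between 1 and 100, `max_depth` at least 1, `formats` drawn from the four listed options); the helper name `build_crawl_request` is illustrative, not part of the API:

```python
def build_crawl_request(url, max_pages=25, max_depth=2, timeout=60,
                        include_subdomains=False, include_links=True,
                        include_images=True, advanced_proxy=False,
                        main_content_only=False, formats=("markdown",)):
    """Build a crawl_stream request body, enforcing the documented ranges."""
    if not 1 <= max_pages <= 100:
        raise ValueError("max_pages must satisfy 1 <= x <= 100")
    if max_depth < 1:
        raise ValueError("max_depth must satisfy x >= 1")
    allowed = {"markdown", "html", "screenshot", "pdf"}
    if not set(formats) <= allowed:
        raise ValueError(f"formats must be a subset of {sorted(allowed)}")
    return {
        "url": url,
        "max_pages": max_pages,
        "max_depth": max_depth,
        "timeout": timeout,
        "include_subdomains": include_subdomains,
        "include_links": include_links,
        "include_images": include_images,
        "advanced_proxy": advanced_proxy,
        "main_content_only": main_content_only,
        "formats": list(formats),
    }
```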

Response

Server-Sent Events stream of crawled pages

SSE stream with event types: page, usage, done, error
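Putting the pieces together, the stream can be consumed line by line and each event routed by its `type` field. A sketch using only the standard library; the exact shape of `usage` and `error` event payloads is not specified above, so the `"message"` key on error events is an assumption:

```python
import json
import urllib.request

API_URL = "https://api.llmlayer.dev/api/v2/crawl_stream"

def iter_events(api_key: str, body: dict):
    """POST the crawl request and yield parsed JSON events from the SSE stream."""
    req = urllib.request.Request(
        API_URL,
        data=json.dumps(body).encode(),
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        },
    )
    with urllib.request.urlopen(req) as resp:
        for raw in resp:
            line = raw.decode().strip()
            if line.startswith("data:"):
                yield json.loads(line[len("data:"):].strip())

def handle_event(event: dict, pages: list) -> bool:
    """Route one event by type; return False once the stream is finished."""
    kind = event.get("type")
    if kind == "page":
        pages.append(event["page"])
    elif kind == "error":
        # "message" key is an assumption; inspect real error events to confirm
        raise RuntimeError(event.get("message", "crawl error"))
    elif kind == "done":
        return False
    # "usage" events can be logged or ignored here
    return True
```

Typical usage would loop `for event in iter_events(key, body)` and stop when `handle_event` returns `False`.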