Crawl Any Website

Retrieve the raw HTML of any webpage via the Piloterr API and extract structured data without maintaining your own headless browser infrastructure.

Overview

This playbook shows how to fetch the full HTML of any webpage using the Piloterr Website Crawler, then extract specific data from it. A typical use case is competitor price monitoring: crawl a product page daily and parse the price from the HTML.

Prerequisites

A Piloterr API key — get one at app.piloterr.com
Install dependencies for your language:

pip install requests beautifulsoup4

npm install node-html-parser

PHP: requires the curl and DOM (DOMDocument) extensions, both of which are enabled in most PHP installations.

No extra dependencies for the request — uses net/http (Go 1.18+). Add golang.org/x/net/html for parsing.

No extra dependencies for the request — uses java.net.http (Java 11+). Add org.jsoup:jsoup for parsing.

No extra dependencies for the request — uses System.Net.Http (.NET 6+). Add HtmlAgilityPack for parsing.

# Cargo.toml
[dependencies]
reqwest = { version = "0.12", features = ["json"] }
tokio = { version = "1", features = ["full"] }
serde_json = "1"
# Used by the Rust HTML-extraction and price-monitoring examples.
regex = "1"

Steps

Crawl a webpage

Call GET /v2/website/crawler with the `query` parameter set to the target URL. The response body is the page's raw HTML, returned as a JSON-encoded string.

import requests

API_KEY = "YOUR_API_KEY"

# Ask the Piloterr crawler to fetch the target page on our behalf.
response = requests.get(
    "https://api.piloterr.com/v2/website/crawler",
    headers={"x-api-key": API_KEY},
    params={"query": "https://example.com", "allow_redirects": "true"},
    timeout=60,  # requests has no default timeout; avoid hanging forever on a slow crawl
)
# Fail loudly on 4xx/5xx instead of trying to slice an error payload below.
response.raise_for_status()

html = response.json()  # the body is a JSON-encoded string holding the page HTML
print(html[:500])

const API_KEY = "YOUR_API_KEY";

// URLSearchParams percent-encodes the target URL for the query string.
const params   = new URLSearchParams({ query: "https://example.com", allow_redirects: "true" });
const response = await fetch(`https://api.piloterr.com/v2/website/crawler?${params}`, {
  headers: { "x-api-key": API_KEY },
});

// fetch() only rejects on network failure, so HTTP errors must be checked explicitly.
if (!response.ok) {
  throw new Error(`Crawler request failed: ${response.status} ${response.statusText}`);
}

const html: string = await response.json(); // body is a JSON-encoded HTML string
console.log(html.slice(0, 500));

<?php
$apiKey = "YOUR_API_KEY";
// http_build_query() percent-encodes the target URL for the query string.
$params = http_build_query(["query" => "https://example.com", "allow_redirects" => "true"]);

$ch = curl_init("https://api.piloterr.com/v2/website/crawler?{$params}");
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_HTTPHEADER, ["x-api-key: {$apiKey}"]);

// Capture the transport error and HTTP status before the handle is closed.
$body   = curl_exec($ch); // false when the request itself failed
$error  = curl_error($ch);
$status = curl_getinfo($ch, CURLINFO_HTTP_CODE);
curl_close($ch);

if ($body === false) {
    fwrite(STDERR, "Crawler request failed: {$error}\n");
    exit(1);
}

$html = json_decode($body, true); // HTML string on success
if ($status >= 400 || !is_string($html)) {
    // Error responses are not a plain JSON string, so do not treat them as HTML.
    fwrite(STDERR, "Unexpected crawler response (HTTP {$status}).\n");
    exit(1);
}

echo substr($html, 0, 500);

package main

import (
    "encoding/json"
    "fmt"
    "io"
    "log"
    "net/http"
    "net/url"
)

// run crawls https://example.com through the Piloterr API and prints the
// first 500 bytes of the returned HTML. It returns the first error hit.
func run() error {
    params := url.Values{"query": {"https://example.com"}, "allow_redirects": {"true"}}
    req, err := http.NewRequest("GET", "https://api.piloterr.com/v2/website/crawler?"+params.Encode(), nil)
    if err != nil {
        return err
    }
    req.Header.Set("x-api-key", "YOUR_API_KEY")

    resp, err := http.DefaultClient.Do(req)
    if err != nil {
        // resp is nil here, so it must not be touched.
        return err
    }
    defer resp.Body.Close()

    body, err := io.ReadAll(resp.Body)
    if err != nil {
        return err
    }
    if resp.StatusCode < 200 || resp.StatusCode > 299 {
        return fmt.Errorf("crawler returned %s", resp.Status)
    }

    var html string
    if err := json.Unmarshal(body, &html); err != nil { // response is a JSON-encoded string
        return fmt.Errorf("decoding crawler response: %w", err)
    }

    // Slicing past the end of a short page would panic, so clamp first.
    if len(html) > 500 {
        html = html[:500]
    }
    fmt.Println(html)
    return nil
}

func main() {
    if err := run(); err != nil {
        log.Fatal(err)
    }
}

import java.net.URI;
import java.net.http.*;

var client  = HttpClient.newHttpClient();
var request = HttpRequest.newBuilder()
    .uri(URI.create("https://api.piloterr.com/v2/website/crawler?query=https%3A%2F%2Fexample.com&allow_redirects=true"))
    .header("x-api-key", "YOUR_API_KEY")
    .GET().build();

var response = client.send(request, HttpResponse.BodyHandlers.ofString());
// Response body is a JSON-encoded string — strip the outer quotes
var html = response.body().replaceAll("^\"|\"$", "")
    .replace("\\n", "\n").replace("\\\"", "\"");
System.out.println(html.substring(0, Math.min(500, html.length())));

using System.Net.Http;
using System.Text.Json;

// One HttpClient that sends the API key with every request.
using var client = new HttpClient();
client.DefaultRequestHeaders.Add("x-api-key", "YOUR_API_KEY");

// The target URL is already percent-encoded inside the query string.
const string endpoint =
    "https://api.piloterr.com/v2/website/crawler?query=https%3A%2F%2Fexample.com&allow_redirects=true";
var raw = await client.GetStringAsync(endpoint);

// The body is a JSON-encoded string; deserializing it yields the HTML.
var html = JsonSerializer.Deserialize<string>(raw)!;
var previewLength = Math.Min(500, html.Length);
Console.WriteLine(html.Substring(0, previewLength));

/// Crawls `https://example.com` through the Piloterr API and prints a preview
/// (at most 500 characters) of the returned HTML.
///
/// # Errors
/// Returns the `reqwest::Error` raised when the request fails, the API answers
/// with a 4xx/5xx status, or the body is not a JSON-encoded string.
#[tokio::main]
async fn main() -> Result<(), reqwest::Error> {
    let html = reqwest::Client::new()
        .get("https://api.piloterr.com/v2/website/crawler")
        .header("x-api-key", "YOUR_API_KEY")
        .query(&[("query", "https://example.com"), ("allow_redirects", "true")])
        .send()
        .await?
        // Surface HTTP errors directly rather than as a confusing JSON decode failure.
        .error_for_status()?
        .json::<String>()
        .await?;

    // Slicing at a fixed byte offset panics when it lands inside a multi-byte
    // UTF-8 character, so cut at the boundary of the 500th character instead.
    let end = html.char_indices().nth(500).map_or(html.len(), |(idx, _)| idx);
    println!("{}", &html[..end]);
    Ok(())
}

Extract data from the HTML

Parse the HTML to extract specific elements — here we extract the page title and all <h1> headings.

from bs4 import BeautifulSoup

# Build a parse tree from the crawled HTML.
soup = BeautifulSoup(html, "html.parser")

# The <title> element may be absent, so fall back to a placeholder.
title = soup.find("title")
page_title = title.text if title else "N/A"
print("Page title:", page_title)

# Print the whitespace-stripped text of every <h1>.
for heading in soup.find_all("h1"):
    print("H1:", heading.get_text(strip=True))

import { parse } from "node-html-parser";

// Parse the crawled HTML into a queryable tree.
const root  = parse(html);

// A missing <title> falls back to "N/A".
const title = root.querySelector("title");
const pageTitle = title?.text ?? "N/A";
console.log("Page title:", pageTitle);

// Log the trimmed text of each <h1>.
root.querySelectorAll("h1").forEach((heading) => {
  console.log("H1:", heading.text.trim());
});

// Parse the crawled HTML; "@" suppresses the warnings loadHTML may emit.
$dom = new DOMDocument();
@$dom->loadHTML($html);
$xpath = new DOMXPath($dom);

// First <title> node, or null when the page has none.
$title = $xpath->query("//title")->item(0);
$titleText = $title ? $title->textContent : "N/A";
echo "Page title: {$titleText}\n";

// Print the trimmed text of every <h1>.
foreach ($xpath->query("//h1") as $heading) {
    $text = trim($heading->textContent);
    echo "H1: {$text}\n";
}

import (
    "fmt"
    "strings"
    "golang.org/x/net/html"
)

doc, _ := html.Parse(strings.NewReader(html))
var traverse func(*html.Node)
traverse = func(n *html.Node) {
    if n.Type == html.ElementNode && n.Data == "title" && n.FirstChild != nil {
        fmt.Println("Page title:", n.FirstChild.Data)
    }
    if n.Type == html.ElementNode && n.Data == "h1" && n.FirstChild != nil {
        fmt.Println("H1:", n.FirstChild.Data)
    }
    for c := n.FirstChild; c != nil; c = c.NextSibling { traverse(c) }
}
traverse(doc)

import org.jsoup.Jsoup;

// Parse the crawled HTML into a jsoup document.
var doc   = Jsoup.parse(html);
System.out.println("Page title: " + doc.title());

// Print the text of each <h1> in document order.
for (var heading : doc.select("h1")) {
    System.out.println("H1: " + heading.text());
}

using HtmlAgilityPack;

// Load the crawled HTML into an HtmlAgilityPack document.
var doc = new HtmlDocument();
doc.LoadHtml(html);

// A missing <title> falls back to "N/A".
var title = doc.DocumentNode.SelectSingleNode("//title");
var titleText = title?.InnerText ?? "N/A";
Console.WriteLine($"Page title: {titleText}");

// SelectNodes may yield null when nothing matches, so guard before iterating.
var headings = doc.DocumentNode.SelectNodes("//h1");
if (headings != null)
{
    foreach (var heading in headings)
        Console.WriteLine($"H1: {heading.InnerText.Trim()}");
}

// Minimal regex-based extraction
use regex::Regex; // add regex = "1" to Cargo.toml

// (?is): match tags case-insensitively and let `.` span newlines, so
// upper-case tags and elements broken across lines are still captured.
let title_re = Regex::new(r"(?is)<title[^>]*>(.*?)</title>").expect("hard-coded title pattern is valid");
if let Some(cap) = title_re.captures(&html) {
    println!("Page title: {}", cap[1].trim());
}

let h1_re = Regex::new(r"(?is)<h1[^>]*>(.*?)</h1>").expect("hard-coded h1 pattern is valid");
for cap in h1_re.captures_iter(&html) {
    // Trim surrounding whitespace, as the other language examples do.
    println!("H1: {}", cap[1].trim());
}

Build a price monitoring script

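Crawl a product page and extract the price using a CSS selector (or an equivalent XPath expression or regular expression in the examples that do not use a CSS selector library).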

import requests
from bs4 import BeautifulSoup

API_KEY   = "YOUR_API_KEY"
WATCH_URL = "https://www.example-shop.com/product/123"

def crawl(url: str) -> str:
    """Fetch ``url`` through the Piloterr crawler and return the decoded JSON body.

    Args:
        url: Address of the page to crawl; sent as the ``query`` parameter.

    Returns:
        The page HTML (the API responds with a JSON-encoded string).

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within the timeout.
    """
    r = requests.get(
        "https://api.piloterr.com/v2/website/crawler",
        headers={"x-api-key": API_KEY},
        params={"query": url, "allow_redirects": "true"},
        timeout=60,  # requests has no default timeout; avoid hanging forever
    )
    # Fail loudly rather than handing an error payload to the HTML parser.
    r.raise_for_status()
    return r.json()

def extract_price(html: str) -> str | None:
    """Return the stripped text of the first price-like element, or ``None``.

    An element qualifies if it has a ``data-price`` attribute, the ``price``
    class, or the id ``price``.
    """
    match = BeautifulSoup(html, "html.parser").select_one("[data-price], .price, #price")
    if not match:
        return None
    return match.get_text(strip=True)

# Crawl the watched product page and report what was found.
price = extract_price(crawl(WATCH_URL))
if price:
    print(f"Current price: {price}")
else:
    print("Price element not found.")

import { parse } from "node-html-parser";

const API_KEY   = "YOUR_API_KEY";
const WATCH_URL = "https://www.example-shop.com/product/123";

/**
 * Fetches `url` through the Piloterr crawler and resolves with the decoded
 * JSON body (the page HTML as a string).
 *
 * @param url Address of the page to crawl; sent as the `query` parameter.
 * @throws Error when the API responds with a non-2xx status.
 */
async function crawl(url: string): Promise<string> {
  const params = new URLSearchParams({ query: url, allow_redirects: "true" });
  const res    = await fetch(`https://api.piloterr.com/v2/website/crawler?${params}`, {
    headers: { "x-api-key": API_KEY },
  });
  // fetch() resolves even on 4xx/5xx, so reject explicitly instead of
  // handing an error payload to the HTML parser.
  if (!res.ok) {
    throw new Error(`Crawler request failed: ${res.status} ${res.statusText}`);
  }
  return res.json();
}

// Crawl the watched page and parse the returned HTML.
const html  = await crawl(WATCH_URL);
const root  = parse(html);

// First element matching any of the common price hooks, if present.
const priceEl = root.querySelector("[data-price], .price, #price");
const price   = priceEl ? priceEl.text.trim() : null;

if (price) {
  console.log(`Current price: ${price}`);
} else {
  console.log("Price element not found.");
}

<?php
/**
 * Fetches a page through the Piloterr crawler and returns its HTML.
 *
 * @param string $apiKey Piloterr API key, sent in the x-api-key header.
 * @param string $url    Address of the page to crawl (the `query` parameter).
 * @return string The page HTML decoded from the JSON-encoded response body.
 * @throws RuntimeException If the request fails, the API answers with an
 *                          error status, or the body is not a JSON string.
 */
function crawl(string $apiKey, string $url): string {
    $params = http_build_query(["query" => $url, "allow_redirects" => "true"]);
    $ch = curl_init("https://api.piloterr.com/v2/website/crawler?{$params}");
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_HTTPHEADER, ["x-api-key: {$apiKey}"]);

    // Capture the transport error and HTTP status before the handle is closed.
    $body   = curl_exec($ch); // false when the request itself failed
    $error  = curl_error($ch);
    $status = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_close($ch);

    if ($body === false) {
        throw new RuntimeException("Crawler request failed: {$error}");
    }

    $html = json_decode($body, true);
    // Returning null or an array here would violate the declared string
    // return type and surface as an opaque TypeError.
    if ($status >= 400 || !is_string($html)) {
        throw new RuntimeException("Unexpected crawler response (HTTP {$status})");
    }
    return $html;
}

$html  = crawl("YOUR_API_KEY", "https://www.example-shop.com/product/123");
$dom   = new DOMDocument();
@$dom->loadHTML($html); // "@" suppresses the warnings loadHTML may emit
$xpath = new DOMXPath($dom);

// XPath equivalents of the CSS selectors `.price`, `#price` and `[data-price]`.
// The class test matches `price` as one of several space-separated classes on
// any element, not only <span class="price"> with that exact attribute value.
$selectors = [
    "//*[contains(concat(' ', normalize-space(@class), ' '), ' price ')]",
    "//*[@id='price']",
    "//*[@data-price]",
];

$price = null;
foreach ($selectors as $sel) {
    $nodes = $xpath->query($sel);
    if ($nodes->length > 0) { $price = trim($nodes->item(0)->textContent); break; }
}
// Compare against null explicitly: the string "0" is falsy in PHP and would
// otherwise be reported as "not found".
echo ($price !== null) ? "Current price: {$price}\n" : "Price element not found.\n";

package main

import (
    "encoding/json"
    "fmt"
    "io"
    "net/http"
    "net/url"
    "regexp"
)

// crawl fetches pageUrl through the Piloterr crawler and returns the page
// HTML, which the API delivers as a JSON-encoded string. It returns an error
// if the request fails, the API answers with a non-2xx status, or the body
// cannot be decoded.
func crawl(apiKey, pageUrl string) (string, error) {
    params := url.Values{"query": {pageUrl}, "allow_redirects": {"true"}}
    req, err := http.NewRequest("GET", "https://api.piloterr.com/v2/website/crawler?"+params.Encode(), nil)
    if err != nil {
        return "", err
    }
    req.Header.Set("x-api-key", apiKey)

    resp, err := http.DefaultClient.Do(req)
    if err != nil {
        // resp is nil here, so it must not be touched.
        return "", err
    }
    defer resp.Body.Close()

    body, err := io.ReadAll(resp.Body)
    if err != nil {
        return "", err
    }
    if resp.StatusCode < 200 || resp.StatusCode > 299 {
        return "", fmt.Errorf("crawler returned %s", resp.Status)
    }

    var html string
    if err := json.Unmarshal(body, &html); err != nil {
        return "", fmt.Errorf("decoding crawler response: %w", err)
    }
    return html, nil
}

// main crawls the watched product page and prints the first text found
// directly after an element whose class attribute starts with "price".
func main() {
    html, err := crawl("YOUR_API_KEY", "https://www.example-shop.com/product/123")
    if err != nil {
        fmt.Println("Crawl failed:", err)
        return
    }

    re    := regexp.MustCompile(`class="price[^"]*"[^>]*>([^<]+)`)
    match := re.FindStringSubmatch(html)
    if match != nil {
        fmt.Println("Current price:", match[1])
    } else {
        fmt.Println("Price element not found.")
    }
}

import java.net.URI;
import java.net.URLEncoder;
import java.net.http.*;
import java.nio.charset.StandardCharsets;
import org.jsoup.Jsoup;

/** Crawls a product page through the Piloterr API and prints its price. */
public class Main {
    public static void main(String[] args) throws Exception {
        var apiKey   = "YOUR_API_KEY";
        var watchUrl = URLEncoder.encode("https://www.example-shop.com/product/123", StandardCharsets.UTF_8);
        var url      = "https://api.piloterr.com/v2/website/crawler?query=" + watchUrl + "&allow_redirects=true";

        var client   = HttpClient.newHttpClient();
        var request  = HttpRequest.newBuilder().uri(URI.create(url))
            .header("x-api-key", apiKey).GET().build();
        var response = client.send(request, HttpResponse.BodyHandlers.ofString());
        if (response.statusCode() / 100 != 2) {
            throw new IllegalStateException("Crawler request failed with HTTP " + response.statusCode());
        }

        var html     = decodeJsonString(response.body());
        var doc      = Jsoup.parse(html);
        var priceEl  = doc.selectFirst(".price, #price, [data-price]");
        System.out.println(priceEl != null ? "Current price: " + priceEl.text() : "Price element not found.");
    }

    /**
     * Decodes a JSON string literal (including its surrounding quotes) into
     * the text it represents.
     *
     * Chained {@code replace} calls cannot do this correctly: they corrupt an
     * escaped backslash followed by {@code n} and ignore {@code \t},
     * {@code \r}, {@code \/} and {@code \\uXXXX}. This walks the input once
     * and resolves each escape in place.
     *
     * @param raw the response body, expected to be a quoted JSON string
     * @return the decoded text
     * @throws IllegalStateException if {@code raw} is not a well-formed JSON string
     */
    private static String decodeJsonString(String raw) {
        var json = raw.strip();
        if (json.length() < 2 || json.charAt(0) != '"' || json.charAt(json.length() - 1) != '"') {
            throw new IllegalStateException("Expected a JSON string from the crawler");
        }
        int end = json.length() - 1; // index of the closing quote
        var decoded = new StringBuilder(json.length());
        for (int i = 1; i < end; i++) {
            char c = json.charAt(i);
            if (c != '\\') {
                decoded.append(c);
                continue;
            }
            if (++i >= end) {
                throw new IllegalStateException("Dangling escape at end of crawler response");
            }
            char esc = json.charAt(i);
            switch (esc) {
                case 'n': decoded.append('\n'); break;
                case 't': decoded.append('\t'); break;
                case 'r': decoded.append('\r'); break;
                case 'b': decoded.append('\b'); break;
                case 'f': decoded.append('\f'); break;
                case 'u':
                    if (i + 4 >= end) {
                        throw new IllegalStateException("Truncated \\u escape in crawler response");
                    }
                    decoded.append((char) Integer.parseInt(json.substring(i + 1, i + 5), 16));
                    i += 4;
                    break;
                default: decoded.append(esc); // covers \" \\ and \/
            }
        }
        return decoded.toString();
    }
}

using System.Net.Http;
using System.Text.Json;
using HtmlAgilityPack;

// One HttpClient that sends the API key with every request.
using var client = new HttpClient();
client.DefaultRequestHeaders.Add("x-api-key", "YOUR_API_KEY");

// Percent-encode the watched URL before embedding it in the query string.
var watchUrl   = Uri.EscapeDataString("https://www.example-shop.com/product/123");
var requestUri = $"https://api.piloterr.com/v2/website/crawler?query={watchUrl}&allow_redirects=true";
var raw        = await client.GetStringAsync(requestUri);
var html       = JsonSerializer.Deserialize<string>(raw)!; // body is a JSON-encoded string

var doc  = new HtmlDocument();
doc.LoadHtml(html);

// First node whose class contains "price", whose id is "price", or that has a data-price attribute.
var price = doc.DocumentNode.SelectSingleNode("//*[contains(@class,'price') or @id='price' or @data-price]");
if (price == null)
    Console.WriteLine("Price element not found.");
else
    Console.WriteLine($"Current price: {price.InnerText.Trim()}");

use reqwest::Client;
use regex::Regex;

async fn crawl(client: &Client, api_key: &str, url: &str) -> String {
    client.get("https://api.piloterr.com/v2/website/crawler")
        .header("x-api-key", api_key)
        .query(&[("query", url), ("allow_redirects", "true")])
        .send().await.unwrap().json::<String>().await.unwrap()
}

#[tokio::main]
async fn main() -> Result<(), reqwest::Error> {
    let client = Client::new();
    let html   = crawl(&client, "YOUR_API_KEY", "https://www.example-shop.com/product/123").await;

    let re = Regex::new(r#"class="price[^"]*"[^>]*>([^<]+)"#).unwrap();
    match re.captures(&html) {
        Some(cap) => println!("Current price: {}", cap[1].trim()),
        None      => println!("Price element not found."),
    }
    Ok(())
}

Set allow_redirects=true to follow HTTP 301/302 redirects automatically — useful for short URLs or e-commerce platforms that redirect product pages.