Crawl Any Website
Retrieve the raw HTML of any webpage via the Piloterr API and extract structured data without maintaining your own headless browser infrastructure.
Overview
This playbook shows how to fetch the full HTML of any webpage using the Piloterr Website Crawler, then extract specific data from it. A typical use case is competitor price monitoring: crawl a product page daily and parse the price from the HTML.
Prerequisites
- A Piloterr API key — get one at app.piloterr.com
- Install dependencies for your language:
Python: pip install requests beautifulsoup4
TypeScript (Node.js): npm install node-html-parser
PHP: requires the curl and DOMDocument extensions (both enabled by default).
No extra dependencies for the request — uses net/http (Go 1.18+). Add golang.org/x/net/html for parsing.
No extra dependencies for the request — uses java.net.http (Java 11+). Add org.jsoup:jsoup for parsing.
No extra dependencies for the request — uses System.Net.Http (.NET 6+). Add HtmlAgilityPack for parsing.
# Cargo.toml
[dependencies]
reqwest = { version = "0.12", features = ["json"] }
tokio = { version = "1", features = ["full"] }
serde_json = "1"
Steps
Crawl a webpage
Call GET /v2/website/crawler with the query parameter set to the target URL. The response is the raw HTML string (JSON-encoded).
import requests
API_KEY = "YOUR_API_KEY"
response = requests.get(
"https://api.piloterr.com/v2/website/crawler",
headers={"x-api-key": API_KEY},
params={"query": "https://example.com", "allow_redirects": "true"},
)
html = response.json() # returns the HTML as a string
print(html[:500])
const API_KEY = "YOUR_API_KEY";
const params = new URLSearchParams({ query: "https://example.com", allow_redirects: "true" });
const response = await fetch(`https://api.piloterr.com/v2/website/crawler?${params}`, {
headers: { "x-api-key": API_KEY },
});
const html: string = await response.json();
console.log(html.slice(0, 500));
<?php
$apiKey = "YOUR_API_KEY";
$params = http_build_query(["query" => "https://example.com", "allow_redirects" => "true"]);
$ch = curl_init("https://api.piloterr.com/v2/website/crawler?{$params}");
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_HTTPHEADER, ["x-api-key: {$apiKey}"]);
$html = json_decode(curl_exec($ch), true); // HTML string
curl_close($ch);
echo substr($html, 0, 500);
package main
import (
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
)
func main() {
params := url.Values{"query": {"https://example.com"}, "allow_redirects": {"true"}}
req, _ := http.NewRequest("GET", "https://api.piloterr.com/v2/website/crawler?"+params.Encode(), nil)
req.Header.Set("x-api-key", "YOUR_API_KEY")
resp, _ := http.DefaultClient.Do(req)
defer resp.Body.Close()
body, _ := io.ReadAll(resp.Body)
var html string
json.Unmarshal(body, &html) // response is a JSON-encoded string
fmt.Println(html[:500])
}import java.net.URI;
import java.net.http.*;
var client = HttpClient.newHttpClient();
var request = HttpRequest.newBuilder()
.uri(URI.create("https://api.piloterr.com/v2/website/crawler?query=https%3A%2F%2Fexample.com&allow_redirects=true"))
.header("x-api-key", "YOUR_API_KEY")
.GET().build();
var response = client.send(request, HttpResponse.BodyHandlers.ofString());
// Response body is a JSON-encoded string — strip the outer quotes
var html = response.body().replaceAll("^\"|\"$", "")
.replace("\\n", "\n").replace("\\\"", "\"");
System.out.println(html.substring(0, Math.min(500, html.length())));
using System.Net.Http;
using System.Text.Json;
using var client = new HttpClient();
client.DefaultRequestHeaders.Add("x-api-key", "YOUR_API_KEY");
var raw = await client.GetStringAsync(
"https://api.piloterr.com/v2/website/crawler?query=https%3A%2F%2Fexample.com&allow_redirects=true");
var html = JsonSerializer.Deserialize<string>(raw)!; // response is JSON-encoded string
Console.WriteLine(html[..Math.Min(500, html.Length)]);
#[tokio::main]
/// Fetches `https://example.com` through the Piloterr crawler endpoint and
/// prints the first 500 characters of the returned HTML.
///
/// # Errors
/// Returns the `reqwest::Error` produced when the request cannot be sent, when
/// the API answers with a 4xx/5xx status, or when the body is not a
/// JSON-encoded string.
async fn main() -> Result<(), reqwest::Error> {
    let html = reqwest::Client::new()
        .get("https://api.piloterr.com/v2/website/crawler")
        .header("x-api-key", "YOUR_API_KEY")
        .query(&[("query", "https://example.com"), ("allow_redirects", "true")])
        .send()
        .await?
        // Turn HTTP error statuses into an error instead of attempting to decode them.
        .error_for_status()?
        .json::<String>()
        .await?;

    // Truncate by characters, not bytes: slicing a `str` at byte 500 panics when
    // that offset lands inside a multi-byte UTF-8 sequence.
    let preview: String = html.chars().take(500).collect();
    println!("{preview}");
    Ok(())
}
Extract data from the HTML
Parse the HTML to extract specific elements — here we extract the page title and all <h1> headings.
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, "html.parser")
title = soup.find("title")
print("Page title:", title.text if title else "N/A")
for h in soup.find_all("h1"):
    print("H1:", h.get_text(strip=True))
import { parse } from "node-html-parser";
const root = parse(html);
const title = root.querySelector("title");
console.log("Page title:", title?.text ?? "N/A");
for (const h of root.querySelectorAll("h1")) {
console.log("H1:", h.text.trim());
}
$dom = new DOMDocument();
@$dom->loadHTML($html);
$xpath = new DOMXPath($dom);
$title = $xpath->query("//title")->item(0);
echo "Page title: " . ($title ? $title->textContent : "N/A") . "\n";
foreach ($xpath->query("//h1") as $h) {
echo "H1: " . trim($h->textContent) . "\n";
}
import (
	"fmt"
	"strings"

	nethtml "golang.org/x/net/html"
)

// The parser package is imported as nethtml because the crawled page lives in
// a string variable named html; without the alias that variable shadows the
// package and nethtml.Parse / *nethtml.Node would not resolve.
doc, _ := nethtml.Parse(strings.NewReader(html))

// traverse walks the node tree depth-first and prints the data of the first
// child node of every <title> and <h1> element it meets.
var traverse func(*nethtml.Node)
traverse = func(n *nethtml.Node) {
	if n.Type == nethtml.ElementNode && n.FirstChild != nil {
		switch n.Data {
		case "title":
			fmt.Println("Page title:", n.FirstChild.Data)
		case "h1":
			fmt.Println("H1:", n.FirstChild.Data)
		}
	}
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		traverse(c)
	}
}
traverse(doc)
import org.jsoup.Jsoup;
var doc = Jsoup.parse(html);
System.out.println("Page title: " + doc.title());
doc.select("h1").forEach(h -> System.out.println("H1: " + h.text()));
using HtmlAgilityPack;
var doc = new HtmlDocument();
doc.LoadHtml(html);
var title = doc.DocumentNode.SelectSingleNode("//title");
Console.WriteLine($"Page title: {title?.InnerText ?? "N/A"}");
foreach (var h in doc.DocumentNode.SelectNodes("//h1") ?? Enumerable.Empty<HtmlNode>())
    Console.WriteLine($"H1: {h.InnerText.Trim()}");
// Minimal regex-based extraction
use regex::Regex; // add regex = "1" to Cargo.toml

// Flags: `i` matches tag names in any case (e.g. `<TITLE>`), and `s` lets `.`
// match newlines so elements whose text spans several lines are still captured.
// The patterns are constants, so a compile failure would be a programming error.
let title_re = Regex::new(r"(?is)<title[^>]*>(.*?)</title>").expect("title pattern is valid");
if let Some(cap) = title_re.captures(&html) {
    // Trim the whitespace that multi-line markup leaves around the text.
    println!("Page title: {}", cap[1].trim());
}

let h1_re = Regex::new(r"(?is)<h1[^>]*>(.*?)</h1>").expect("h1 pattern is valid");
for cap in h1_re.captures_iter(&html) {
    println!("H1: {}", cap[1].trim());
}
Build a price monitoring script
Crawl a product page and extract the price using a CSS selector.
import requests
from bs4 import BeautifulSoup
API_KEY = "YOUR_API_KEY"
WATCH_URL = "https://www.example-shop.com/product/123"
def crawl(url: str) -> str:
    """Fetch ``url`` through the Piloterr crawler and return the decoded body.

    Args:
        url: Address of the page to crawl; sent as the ``query`` parameter.

    Returns:
        The JSON-decoded response body (the page HTML on success).

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    r = requests.get(
        "https://api.piloterr.com/v2/website/crawler",
        headers={"x-api-key": API_KEY},
        params={"query": url, "allow_redirects": "true"},
        # requests waits indefinitely unless a timeout is given.
        timeout=60,
    )
    # Fail loudly on an error status instead of handing an error body to the HTML parser.
    r.raise_for_status()
    return r.json()
def extract_price(html: str) -> str | None:
    """Return the stripped text of the first price element in ``html``.

    The price element is the first match for the CSS selector
    ``[data-price], .price, #price``. Returns ``None`` when nothing matches.
    """
    match = BeautifulSoup(html, "html.parser").select_one("[data-price], .price, #price")
    if not match:
        return None
    return match.get_text(strip=True)
price = extract_price(crawl(WATCH_URL))
print(f"Current price: {price}" if price else "Price element not found.")
import { parse } from "node-html-parser";
const API_KEY = "YOUR_API_KEY";
const WATCH_URL = "https://www.example-shop.com/product/123";
/**
 * Fetches `url` through the Piloterr crawler and resolves with the decoded
 * response body (the page HTML on success).
 *
 * @param url Address of the page to crawl; sent as the `query` parameter.
 * @throws Error when the API responds with a non-2xx status.
 */
async function crawl(url: string): Promise<string> {
  const params = new URLSearchParams({ query: url, allow_redirects: "true" });
  const res = await fetch(`https://api.piloterr.com/v2/website/crawler?${params}`, {
    headers: { "x-api-key": API_KEY },
  });
  // fetch only rejects on network failure; HTTP error statuses must be checked explicitly.
  if (!res.ok) {
    throw new Error(`Crawler request failed: ${res.status} ${res.statusText}`);
  }
  return res.json();
}
const html = await crawl(WATCH_URL);
const root = parse(html);
const price = root.querySelector("[data-price], .price, #price")?.text.trim() ?? null;
console.log(price ? `Current price: ${price}` : "Price element not found.");
<?php
/**
 * Fetches $url through the Piloterr crawler and returns the decoded HTML.
 *
 * @param string $apiKey Value sent in the x-api-key header.
 * @param string $url    Address of the page to crawl; sent as the query parameter.
 * @return string The JSON-decoded response body.
 * @throws RuntimeException If the request fails or the body is not a JSON-encoded string.
 */
function crawl(string $apiKey, string $url): string {
    $params = http_build_query(["query" => $url, "allow_redirects" => "true"]);
    $ch = curl_init("https://api.piloterr.com/v2/website/crawler?{$params}");
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_HTTPHEADER, ["x-api-key: {$apiKey}"]);
    $body = curl_exec($ch);
    // Read the error text before the handle is closed.
    $error = curl_error($ch);
    curl_close($ch);

    // curl_exec returns false when the transfer itself fails.
    if ($body === false) {
        throw new RuntimeException("Crawler request failed: {$error}");
    }

    $html = json_decode($body, true);
    // The declared return type is string; reject anything else with a clear message.
    if (!is_string($html)) {
        throw new RuntimeException("Unexpected crawler response: expected a JSON-encoded string.");
    }
    return $html;
}
$html = crawl("YOUR_API_KEY", "https://www.example-shop.com/product/123");
$dom = new DOMDocument();
@$dom->loadHTML($html);
$xpath = new DOMXPath($dom);
$price = null;
foreach (["//span[@class='price']", "//*[@id='price']", "//*[@data-price]"] as $sel) {
$nodes = $xpath->query($sel);
if ($nodes->length > 0) { $price = trim($nodes->item(0)->textContent); break; }
}
echo $price ? "Current price: {$price}\n" : "Price element not found.\n";
package main
import (
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
"regexp"
)
func crawl(apiKey, pageUrl string) string {
params := url.Values{"query": {pageUrl}, "allow_redirects": {"true"}}
req, _ := http.NewRequest("GET", "https://api.piloterr.com/v2/website/crawler?"+params.Encode(), nil)
req.Header.Set("x-api-key", apiKey)
resp, _ := http.DefaultClient.Do(req)
defer resp.Body.Close()
body, _ := io.ReadAll(resp.Body)
var html string
json.Unmarshal(body, &html)
return html
}
func main() {
html := crawl("YOUR_API_KEY", "https://www.example-shop.com/product/123")
re := regexp.MustCompile(`class="price[^"]*"[^>]*>([^<]+)`)
match := re.FindStringSubmatch(html)
if match != nil {
fmt.Println("Current price:", match[1])
} else {
fmt.Println("Price element not found.")
}
}import java.net.URI;
import java.net.URLEncoder;
import java.net.http.*;
import java.nio.charset.StandardCharsets;
import org.jsoup.Jsoup;
public class Main {
public static void main(String[] args) throws Exception {
var apiKey = "YOUR_API_KEY";
var watchUrl = URLEncoder.encode("https://www.example-shop.com/product/123", StandardCharsets.UTF_8);
var url = "https://api.piloterr.com/v2/website/crawler?query=" + watchUrl + "&allow_redirects=true";
var client = HttpClient.newHttpClient();
var request = HttpRequest.newBuilder().uri(URI.create(url))
.header("x-api-key", apiKey).GET().build();
var response = client.send(request, HttpResponse.BodyHandlers.ofString());
// Strip JSON string quotes
var html = response.body().replaceAll("^\"|\"$", "").replace("\\\"", "\"").replace("\\n", "\n");
var doc = Jsoup.parse(html);
var priceEl = doc.selectFirst(".price, #price, [data-price]");
System.out.println(priceEl != null ? "Current price: " + priceEl.text() : "Price element not found.");
}
}
using System.Net.Http;
using System.Text.Json;
using HtmlAgilityPack;
using var client = new HttpClient();
client.DefaultRequestHeaders.Add("x-api-key", "YOUR_API_KEY");
var watchUrl = Uri.EscapeDataString("https://www.example-shop.com/product/123");
var raw = await client.GetStringAsync(
$"https://api.piloterr.com/v2/website/crawler?query={watchUrl}&allow_redirects=true");
var html = JsonSerializer.Deserialize<string>(raw)!;
var doc = new HtmlDocument();
doc.LoadHtml(html);
var price = doc.DocumentNode.SelectSingleNode("//*[contains(@class,'price') or @id='price' or @data-price]");
Console.WriteLine(price != null ? $"Current price: {price.InnerText.Trim()}" : "Price element not found.");
use reqwest::Client;
use regex::Regex;
/// Fetches `url` through the Piloterr crawler and returns the decoded HTML.
///
/// # Errors
/// Returns the `reqwest::Error` produced when the request cannot be sent, when
/// the API answers with a 4xx/5xx status, or when the body is not a
/// JSON-encoded string.
async fn crawl(client: &Client, api_key: &str, url: &str) -> Result<String, reqwest::Error> {
    client
        .get("https://api.piloterr.com/v2/website/crawler")
        .header("x-api-key", api_key)
        .query(&[("query", url), ("allow_redirects", "true")])
        .send()
        .await?
        // Report HTTP error statuses as errors rather than trying to decode them.
        .error_for_status()?
        .json::<String>()
        .await
}

/// Crawls the watched product page and prints the text that follows the first
/// element whose `class` attribute starts with `price`.
#[tokio::main]
async fn main() -> Result<(), reqwest::Error> {
    let client = Client::new();
    // Network and decode failures propagate to the caller instead of panicking.
    let html = crawl(&client, "YOUR_API_KEY", "https://www.example-shop.com/product/123").await?;

    // The pattern is a constant, so failing to compile it would be a programming error.
    let re = Regex::new(r#"class="price[^"]*"[^>]*>([^<]+)"#).expect("price pattern is valid");
    match re.captures(&html) {
        Some(cap) => println!("Current price: {}", cap[1].trim()),
        None => println!("Price element not found."),
    }
    Ok(())
}
Set allow_redirects=true to follow HTTP 301/302 redirects automatically — useful for short URLs or e-commerce platforms that redirect product pages.