Beliebige Website crawlen

Rufen Sie das rohe HTML einer beliebigen Webseite über die Piloterr API ab und extrahieren Sie strukturierte Daten — ohne eigene Headless-Browser-Infrastruktur.

Übersicht

Dieses Playbook zeigt, wie Sie das vollständige HTML einer Webseite mit dem Piloterr Website Crawler abrufen und dann spezifische Daten daraus extrahieren. Ein typischer Anwendungsfall ist die Konkurrenzpreisüberwachung: Crawlen Sie täglich eine Produktseite und parsen Sie den Preis aus dem HTML.

Voraussetzungen

Einen Piloterr-API-Schlüssel — erhalten Sie einen auf app.piloterr.com
Installieren Sie die Abhängigkeiten für Ihre Sprache:

pip install requests beautifulsoup4

npm install node-html-parser

PHP-Erweiterungen curl und dom (stellt DOMDocument bereit; in den meisten PHP-Installationen standardmäßig aktiviert).

Keine zusätzlichen Abhängigkeiten für die Anfrage — verwendet net/http (Go 1.18+). Fügen Sie golang.org/x/net/html zum Parsen hinzu.

Keine zusätzlichen Abhängigkeiten für die Anfrage — verwendet java.net.http (Java 11+). Fügen Sie org.jsoup:jsoup zum Parsen hinzu.

Keine zusätzlichen Abhängigkeiten für die Anfrage — verwendet System.Net.Http (.NET 6+). Fügen Sie HtmlAgilityPack zum Parsen hinzu.

# Cargo.toml
[dependencies]
reqwest = { version = "0.12", features = ["json"] }
tokio = { version = "1", features = ["full"] }
serde_json = "1"
# Required by the HTML extraction and price-monitoring examples.
regex = "1"

Schritte

Webseite crawlen

Rufen Sie GET /v2/website/crawler auf und setzen Sie den Parameter query auf die Ziel-URL. Die Antwort ist das rohe HTML als JSON-kodierte Zeichenkette.

import requests

API_KEY = "YOUR_API_KEY"

# Fetch the target page through the Piloterr crawler endpoint.
response = requests.get(
    "https://api.piloterr.com/v2/website/crawler",
    headers={"x-api-key": API_KEY},
    params={"query": "https://example.com", "allow_redirects": "true"},
    timeout=60,  # requests has no default timeout; without one a stalled call blocks forever
)
# Fail loudly on 4xx/5xx instead of treating an error payload as HTML.
response.raise_for_status()

html = response.json()  # the body is a JSON-encoded string holding the raw HTML
print(html[:500])

const API_KEY = "YOUR_API_KEY";

// URLSearchParams percent-encodes the target URL for the query string.
const params   = new URLSearchParams({ query: "https://example.com", allow_redirects: "true" });
const response = await fetch(`https://api.piloterr.com/v2/website/crawler?${params}`, {
  headers: { "x-api-key": API_KEY },
});

// fetch() only rejects on network failures, so HTTP error statuses must be checked explicitly.
if (!response.ok) {
  throw new Error(`Piloterr-Anfrage fehlgeschlagen: ${response.status} ${response.statusText}`);
}

// The body is a JSON-encoded string holding the raw HTML.
const html: string = await response.json();
console.log(html.slice(0, 500));

<?php
$apiKey = "YOUR_API_KEY";
$params = http_build_query(["query" => "https://example.com", "allow_redirects" => "true"]);

$ch = curl_init("https://api.piloterr.com/v2/website/crawler?{$params}");
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_HTTPHEADER, ["x-api-key: {$apiKey}"]);

// Collect everything needed from the handle before closing it.
$body   = curl_exec($ch);
$error  = curl_error($ch);
$status = curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
curl_close($ch);

// curl_exec() returns false on transport errors; json_decode(false) would silently yield null.
if ($body === false) {
    throw new RuntimeException("Piloterr-Anfrage fehlgeschlagen: {$error}");
}
if ($status >= 400) {
    throw new RuntimeException("Piloterr-Anfrage fehlgeschlagen: HTTP {$status}");
}

$html = json_decode($body, true); // the body is a JSON-encoded string holding the raw HTML
if (!is_string($html)) {
    throw new RuntimeException("Unerwartete Antwort: keine JSON-Zeichenkette");
}

echo substr($html, 0, 500);

package main

import (
    "encoding/json"
    "fmt"
    "io"
    "log"
    "net/http"
    "net/url"
)

// run fetches https://example.com through the Piloterr crawler and prints the
// first 500 bytes of the returned HTML. It returns the first error encountered
// so that main can exit after all deferred cleanup has run.
func run() error {
    params := url.Values{"query": {"https://example.com"}, "allow_redirects": {"true"}}
    req, err := http.NewRequest("GET", "https://api.piloterr.com/v2/website/crawler?"+params.Encode(), nil)
    if err != nil {
        return err
    }
    req.Header.Set("x-api-key", "YOUR_API_KEY")

    // On a transport error resp is nil, so it must not be touched before this check.
    resp, err := http.DefaultClient.Do(req)
    if err != nil {
        return err
    }
    defer resp.Body.Close()

    body, err := io.ReadAll(resp.Body)
    if err != nil {
        return err
    }
    if resp.StatusCode >= 400 {
        return fmt.Errorf("Piloterr-Anfrage fehlgeschlagen: %s", resp.Status)
    }

    var html string
    // The response is a JSON-encoded string holding the raw HTML.
    if err := json.Unmarshal(body, &html); err != nil {
        return err
    }

    // Slicing html[:500] directly panics when the page is shorter than 500 bytes.
    preview := html
    if len(preview) > 500 {
        preview = preview[:500]
    }
    fmt.Println(preview)
    return nil
}

func main() {
    if err := run(); err != nil {
        log.Fatal(err)
    }
}

import java.net.URI;
import java.net.http.*;

var client  = HttpClient.newHttpClient();
var request = HttpRequest.newBuilder()
    .uri(URI.create("https://api.piloterr.com/v2/website/crawler?query=https%3A%2F%2Fexample.com&allow_redirects=true"))
    .header("x-api-key", "YOUR_API_KEY")
    .GET().build();

var response = client.send(request, HttpResponse.BodyHandlers.ofString());
var html = response.body().replaceAll("^\"|\"$", "")
    .replace("\\n", "\n").replace("\\\"", "\"");
System.out.println(html.substring(0, Math.min(500, html.length())));

using System.Net.Http;
using System.Text.Json;

// The API key header is attached to every request sent through this client.
using var http = new HttpClient();
http.DefaultRequestHeaders.Add("x-api-key", "YOUR_API_KEY");

const string requestUri =
    "https://api.piloterr.com/v2/website/crawler?query=https%3A%2F%2Fexample.com&allow_redirects=true";
var payload = await http.GetStringAsync(requestUri);

// The body is a JSON string literal; deserializing it yields the raw HTML.
var html = JsonSerializer.Deserialize<string>(payload)!;

var previewLength = Math.Min(500, html.Length);
Console.WriteLine(html.Substring(0, previewLength));

/// Fetches a page through the Piloterr crawler and prints the first 500 characters of its HTML.
///
/// # Errors
/// Returns a `reqwest::Error` if the request fails, the API answers with a 4xx/5xx status,
/// or the body is not a JSON-encoded string.
#[tokio::main]
async fn main() -> Result<(), reqwest::Error> {
    let html = reqwest::Client::new()
        .get("https://api.piloterr.com/v2/website/crawler")
        .header("x-api-key", "YOUR_API_KEY")
        .query(&[("query", "https://example.com"), ("allow_redirects", "true")])
        .send()
        .await?
        // Surface HTTP error statuses as `Err` rather than as a confusing JSON decode failure.
        .error_for_status()?
        .json::<String>()
        .await?;

    // Byte-slicing `&html[..500]` panics when byte 500 falls inside a multi-byte UTF-8
    // character, so take whole characters instead.
    let preview: String = html.chars().take(500).collect();
    println!("{preview}");
    Ok(())
}

Daten aus dem HTML extrahieren

Parsen Sie das HTML, um bestimmte Elemente zu extrahieren — hier extrahieren wir den Seitentitel und alle <h1>-Überschriften.

from bs4 import BeautifulSoup

document = BeautifulSoup(html, "html.parser")

# Page title, falling back to "N/A" when the document has no <title>.
title_tag = document.find("title")
if title_tag:
    page_title = title_tag.text
else:
    page_title = "N/A"
print("Seitentitel:", page_title)

# Every <h1>, with surrounding whitespace stripped.
for heading in document.find_all("h1"):
    print("H1:", heading.get_text(strip=True))

import { parse } from "node-html-parser";

// Parse once, then query the resulting tree.
const tree = parse(html);

// Page title, falling back to "N/A" when no <title> element exists.
const titleNode = tree.querySelector("title");
const pageTitle = titleNode?.text ?? "N/A";
console.log("Seitentitel:", pageTitle);

// Every <h1>, trimmed of surrounding whitespace.
tree.querySelectorAll("h1").forEach((heading) => {
  console.log("H1:", heading.text.trim());
});

$dom = new DOMDocument();
// "@" silences the warnings libxml emits for real-world, non-well-formed HTML.
@$dom->loadHTML($html);

// First <title> in document order, or "N/A" when the page has none.
$titleNode = $dom->getElementsByTagName("title")->item(0);
$pageTitle = $titleNode ? $titleNode->textContent : "N/A";
echo "Seitentitel: " . $pageTitle . "\n";

// Every <h1>, trimmed of surrounding whitespace.
foreach ($dom->getElementsByTagName("h1") as $heading) {
    echo "H1: " . trim($heading->textContent) . "\n";
}

import (
    "fmt"
    "strings"
    "golang.org/x/net/html"
)

doc, _ := html.Parse(strings.NewReader(html))
var traverse func(*html.Node)
traverse = func(n *html.Node) {
    if n.Type == html.ElementNode && n.Data == "title" && n.FirstChild != nil {
        fmt.Println("Seitentitel:", n.FirstChild.Data)
    }
    if n.Type == html.ElementNode && n.Data == "h1" && n.FirstChild != nil {
        fmt.Println("H1:", n.FirstChild.Data)
    }
    for c := n.FirstChild; c != nil; c = c.NextSibling { traverse(c) }
}
traverse(doc)

import org.jsoup.Jsoup;

// Parse the crawled markup into a jsoup document.
var document = Jsoup.parse(html);

var pageTitle = document.title();
System.out.println("Seitentitel: " + pageTitle);

// Print the text of each <h1> in document order.
for (var heading : document.select("h1")) {
    System.out.println("H1: " + heading.text());
}

using HtmlAgilityPack;

var document = new HtmlDocument();
document.LoadHtml(html);
var root = document.DocumentNode;

// Page title, falling back to "N/A" when no <title> node is found.
var titleNode = root.SelectSingleNode("//title");
var pageTitle = titleNode?.InnerText ?? "N/A";
Console.WriteLine($"Seitentitel: {pageTitle}");

// SelectNodes can return null when nothing matches, so guard before iterating.
var headings = root.SelectNodes("//h1");
if (headings != null)
{
    foreach (var heading in headings)
        Console.WriteLine($"H1: {heading.InnerText.Trim()}");
}

// Minimal extraction via regex
use regex::Regex; // add regex = "1" to Cargo.toml

// `(?is)`: `i` matches tags regardless of case (<TITLE>, <H1>) and `s` lets `.` span
// newlines, so titles and headings that wrap across lines are still captured.
let title_re = Regex::new(r"(?is)<title[^>]*>(.*?)</title>").expect("static pattern is valid");
if let Some(cap) = title_re.captures(&html) {
    println!("Seitentitel: {}", cap[1].trim());
}

let h1_re = Regex::new(r"(?is)<h1[^>]*>(.*?)</h1>").expect("static pattern is valid");
for cap in h1_re.captures_iter(&html) {
    println!("H1: {}", cap[1].trim());
}

Preisüberwachungs-Skript erstellen

Crawlen Sie eine Produktseite und extrahieren Sie den Preis mit einem CSS-Selektor (bzw. je nach Sprache per XPath oder regulärem Ausdruck).

import requests
from bs4 import BeautifulSoup

API_KEY   = "YOUR_API_KEY"
WATCH_URL = "https://www.example-shop.com/product/123"

def crawl(url: str) -> str:
    """Return the raw HTML of ``url`` fetched through the Piloterr crawler.

    Raises:
        requests.HTTPError: if the API responds with a 4xx/5xx status.
        requests.Timeout: if no response arrives within the timeout.
    """
    r = requests.get(
        "https://api.piloterr.com/v2/website/crawler",
        headers={"x-api-key": API_KEY},
        params={"query": url, "allow_redirects": "true"},
        timeout=60,  # requests never times out by default
    )
    # Without this, an error payload would be handed to the HTML parser as if it were a page.
    r.raise_for_status()
    return r.json()

def extract_price(html: str) -> str | None:
    """Return the stripped text of the first element matching a price selector, or None."""
    soup = BeautifulSoup(html, "html.parser")
    el   = soup.select_one("[data-price], .price, #price")
    return el.get_text(strip=True) if el else None

price = extract_price(crawl(WATCH_URL))
print(f"Aktueller Preis: {price}" if price else "Preiselement nicht gefunden.")

import { parse } from "node-html-parser";

const API_KEY   = "YOUR_API_KEY";
const WATCH_URL = "https://www.example-shop.com/product/123";

/**
 * Fetches `url` through the Piloterr crawler and resolves with its raw HTML.
 * Rejects when the API answers with a non-2xx status.
 */
async function crawl(url: string): Promise<string> {
  const params = new URLSearchParams({ query: url, allow_redirects: "true" });
  const res    = await fetch(`https://api.piloterr.com/v2/website/crawler?${params}`, {
    headers: { "x-api-key": API_KEY },
  });
  // fetch() resolves on 4xx/5xx too; reject here so error payloads are never parsed as HTML.
  if (!res.ok) {
    throw new Error(`Piloterr-Anfrage fehlgeschlagen: ${res.status} ${res.statusText}`);
  }
  return res.json();
}

const html  = await crawl(WATCH_URL);
const root  = parse(html);
// First element matching any of the common price selectors, or null when none matches.
const price = root.querySelector("[data-price], .price, #price")?.text.trim() ?? null;
console.log(price ? `Aktueller Preis: ${price}` : "Preiselement nicht gefunden.");

<?php
/**
 * Fetches $url through the Piloterr crawler and returns its raw HTML.
 *
 * @throws RuntimeException on transport errors, HTTP error statuses, or a body
 *                          that is not a JSON-encoded string.
 */
function crawl(string $apiKey, string $url): string {
    $params = http_build_query(["query" => $url, "allow_redirects" => "true"]);
    $ch = curl_init("https://api.piloterr.com/v2/website/crawler?{$params}");
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_HTTPHEADER, ["x-api-key: {$apiKey}"]);

    $body   = curl_exec($ch);
    $error  = curl_error($ch);
    $status = curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
    curl_close($ch);

    // curl_exec() returns false on failure; returning a decoded null/false from a
    // function declared ": string" would raise a TypeError with no useful context.
    if ($body === false) {
        throw new RuntimeException("Piloterr-Anfrage fehlgeschlagen: {$error}");
    }
    if ($status >= 400) {
        throw new RuntimeException("Piloterr-Anfrage fehlgeschlagen: HTTP {$status}");
    }
    $html = json_decode($body, true);
    if (!is_string($html)) {
        throw new RuntimeException("Unerwartete Antwort: keine JSON-Zeichenkette");
    }
    return $html;
}

$html  = crawl("YOUR_API_KEY", "https://www.example-shop.com/product/123");
$dom   = new DOMDocument();
// "@" silences the warnings libxml emits for non-well-formed HTML.
@$dom->loadHTML($html);
$xpath = new DOMXPath($dom);

// The first expression matches any element whose class list contains the token "price",
// i.e. the XPath equivalent of the CSS selector ".price".
$selectors = [
    "//*[contains(concat(' ', normalize-space(@class), ' '), ' price ')]",
    "//*[@id='price']",
    "//*[@data-price]",
];

$price = null;
foreach ($selectors as $sel) {
    $nodes = $xpath->query($sel);
    if ($nodes->length > 0) { $price = trim($nodes->item(0)->textContent); break; }
}
// Compare against null: a price text of "0" is falsy in PHP and would be reported as missing.
echo $price !== null ? "Aktueller Preis: {$price}\n" : "Preiselement nicht gefunden.\n";

package main

import (
    "encoding/json"
    "fmt"
    "io"
    "net/http"
    "net/url"
    "regexp"
)

// crawl fetches pageUrl through the Piloterr crawler and returns the decoded HTML.
// It returns an error for transport failures, HTTP error statuses, and bodies
// that are not a JSON-encoded string.
func crawl(apiKey, pageUrl string) (string, error) {
    params := url.Values{"query": {pageUrl}, "allow_redirects": {"true"}}
    req, err := http.NewRequest("GET", "https://api.piloterr.com/v2/website/crawler?"+params.Encode(), nil)
    if err != nil {
        return "", err
    }
    req.Header.Set("x-api-key", apiKey)

    // On a transport error resp is nil, so it must not be touched before this check.
    resp, err := http.DefaultClient.Do(req)
    if err != nil {
        return "", err
    }
    defer resp.Body.Close()

    body, err := io.ReadAll(resp.Body)
    if err != nil {
        return "", err
    }
    if resp.StatusCode >= 400 {
        return "", fmt.Errorf("Piloterr-Anfrage fehlgeschlagen: %s", resp.Status)
    }

    var html string
    if err := json.Unmarshal(body, &html); err != nil {
        return "", err
    }
    return html, nil
}

func main() {
    html, err := crawl("YOUR_API_KEY", "https://www.example-shop.com/product/123")
    if err != nil {
        fmt.Println("Fehler beim Crawlen:", err)
        return
    }

    // Captures the text that directly follows an opening tag whose class attribute starts with "price".
    re    := regexp.MustCompile(`class="price[^"]*"[^>]*>([^<]+)`)
    match := re.FindStringSubmatch(html)
    if match != nil {
        fmt.Println("Aktueller Preis:", match[1])
    } else {
        fmt.Println("Preiselement nicht gefunden.")
    }
}

import java.net.URI;
import java.net.URLEncoder;
import java.net.http.*;
import java.nio.charset.StandardCharsets;
import org.jsoup.Jsoup;

public class Main {
    public static void main(String[] args) throws Exception {
        var apiKey   = "YOUR_API_KEY";
        var watchUrl = URLEncoder.encode("https://www.example-shop.com/product/123", StandardCharsets.UTF_8);
        var url      = "https://api.piloterr.com/v2/website/crawler?query=" + watchUrl + "&allow_redirects=true";

        var client   = HttpClient.newHttpClient();
        var request  = HttpRequest.newBuilder().uri(URI.create(url))
            .header("x-api-key", apiKey).GET().build();
        var response = client.send(request, HttpResponse.BodyHandlers.ofString());

        var html     = response.body().replaceAll("^\"|\"$", "").replace("\\\"", "\"").replace("\\n", "\n");
        var doc      = Jsoup.parse(html);
        var priceEl  = doc.selectFirst(".price, #price, [data-price]");
        System.out.println(priceEl != null ? "Aktueller Preis: " + priceEl.text() : "Preiselement nicht gefunden.");
    }
}

using System.Net.Http;
using System.Text.Json;
using HtmlAgilityPack;

// The API key header is attached to every request sent through this client.
using var http = new HttpClient();
http.DefaultRequestHeaders.Add("x-api-key", "YOUR_API_KEY");

var encodedTarget = Uri.EscapeDataString("https://www.example-shop.com/product/123");
var requestUri    = $"https://api.piloterr.com/v2/website/crawler?query={encodedTarget}&allow_redirects=true";
var payload       = await http.GetStringAsync(requestUri);

// The body is a JSON string literal; deserializing it yields the raw HTML.
var html = JsonSerializer.Deserialize<string>(payload)!;

var page = new HtmlDocument();
page.LoadHtml(html);

// First element with "price" in its class attribute, id="price", or a data-price attribute.
var priceNode = page.DocumentNode.SelectSingleNode(
    "//*[contains(@class,'price') or @id='price' or @data-price]");

if (priceNode == null)
{
    Console.WriteLine("Preiselement nicht gefunden.");
}
else
{
    Console.WriteLine($"Aktueller Preis: {priceNode.InnerText.Trim()}");
}

use reqwest::Client;
use regex::Regex;

async fn crawl(client: &Client, api_key: &str, url: &str) -> String {
    client.get("https://api.piloterr.com/v2/website/crawler")
        .header("x-api-key", api_key)
        .query(&[("query", url), ("allow_redirects", "true")])
        .send().await.unwrap().json::<String>().await.unwrap()
}

#[tokio::main]
async fn main() -> Result<(), reqwest::Error> {
    let client = Client::new();
    let html   = crawl(&client, "YOUR_API_KEY", "https://www.example-shop.com/product/123").await;

    let re = Regex::new(r#"class="price[^"]*"[^>]*>([^<]+)"#).unwrap();
    match re.captures(&html) {
        Some(cap) => println!("Aktueller Preis: {}", cap[1].trim()),
        None      => println!("Preiselement nicht gefunden."),
    }
    Ok(())
}

Setzen Sie allow_redirects=true, um HTTP 301/302-Weiterleitungen automatisch zu folgen — nützlich für Kurz-URLs oder E-Commerce-Plattformen, die Produktseiten umleiten.