Explorer n'importe quel site web

Récupérez le HTML brut de n'importe quelle page web via l'API Piloterr et extrayez des données structurées sans maintenir votre propre infrastructure de navigateur headless.

Vue d'ensemble

Ce guide montre comment récupérer le HTML complet d'une page web avec le Piloterr Website Crawler, puis en extraire des données spécifiques. Un cas d'usage typique est la surveillance des prix concurrents : crawl quotidien d'une page produit et extraction du prix depuis le HTML.

Prérequis

Une clé API Piloterr — obtenez-en une sur app.piloterr.com
Installez les dépendances pour votre langage :

pip install requests beautifulsoup4

npm install node-html-parser

Extensions PHP curl et dom (cette dernière fournit DOMDocument), activées par défaut dans la plupart des installations.

Aucune dépendance supplémentaire pour la requête — utilise net/http (Go 1.18+). Ajoutez golang.org/x/net/html pour l'analyse.

Aucune dépendance supplémentaire pour la requête — utilise java.net.http (Java 11+). Ajoutez org.jsoup:jsoup pour l'analyse.

Aucune dépendance supplémentaire pour la requête — utilise System.Net.Http (.NET 6+). Ajoutez HtmlAgilityPack pour l'analyse.

# Cargo.toml
[dependencies]
reqwest = { version = "0.12", features = ["json"] }
tokio = { version = "1", features = ["full"] }
serde_json = "1"

Étapes

Explorer une page web

Appelez GET /v2/website/crawler avec le paramètre query défini sur l'URL cible. La réponse est une chaîne JSON contenant le HTML brut de la page : décodez-la comme du JSON pour obtenir le HTML.

import requests

API_KEY = "YOUR_API_KEY"

# Fetch the target page through the Piloterr crawler endpoint.
response = requests.get(
    "https://api.piloterr.com/v2/website/crawler",
    headers={"x-api-key": API_KEY},
    params={"query": "https://example.com", "allow_redirects": "true"},
    timeout=60,  # requests never times out by default; do not hang forever
)
# Fail loudly on 4xx/5xx instead of treating an error payload as HTML.
response.raise_for_status()

html = response.json()  # the body is a JSON-encoded string holding the HTML
print(html[:500])

const API_KEY = "YOUR_API_KEY";

const params   = new URLSearchParams({ query: "https://example.com", allow_redirects: "true" });
const response = await fetch(`https://api.piloterr.com/v2/website/crawler?${params}`, {
  headers: { "x-api-key": API_KEY },
});

// fetch() only rejects on network failure, so HTTP errors must be surfaced explicitly.
if (!response.ok) {
  throw new Error(`Piloterr request failed: ${response.status} ${response.statusText}`);
}

// The body is a JSON-encoded string containing the raw HTML.
const html: string = await response.json();
console.log(html.slice(0, 500));

<?php
$apiKey = "YOUR_API_KEY";
$params = http_build_query(["query" => "https://example.com", "allow_redirects" => "true"]);

$ch = curl_init("https://api.piloterr.com/v2/website/crawler?{$params}");
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_HTTPHEADER, ["x-api-key: {$apiKey}"]);

// Read the transfer outcome before closing the handle.
$body   = curl_exec($ch);
$error  = curl_error($ch);
$status = curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
curl_close($ch);

// curl_exec() returns false on transport failure; never feed that to json_decode().
if ($body === false) {
    throw new RuntimeException("Piloterr request failed: {$error}");
}
if ($status >= 400) {
    throw new RuntimeException("Piloterr request failed with HTTP status {$status}");
}

// The body is a JSON-encoded string holding the HTML.
$html = json_decode($body, true);
if (!is_string($html)) {
    throw new RuntimeException("Unexpected response: expected a JSON-encoded HTML string");
}

echo substr($html, 0, 500);

package main

import (
    "encoding/json"
    "fmt"
    "io"
    "log"
    "net/http"
    "net/url"
)

// main crawls https://example.com through the Piloterr API and prints the
// first 500 bytes of the returned HTML.
func main() {
    params := url.Values{"query": {"https://example.com"}, "allow_redirects": {"true"}}
    req, err := http.NewRequest("GET", "https://api.piloterr.com/v2/website/crawler?"+params.Encode(), nil)
    if err != nil {
        log.Fatalf("building request: %v", err)
    }
    req.Header.Set("x-api-key", "YOUR_API_KEY")

    resp, err := http.DefaultClient.Do(req)
    if err != nil {
        // resp is nil here; touching resp.Body would panic.
        log.Fatalf("sending request: %v", err)
    }
    body, err := io.ReadAll(resp.Body)
    // Closed explicitly rather than deferred: log.Fatalf exits without running defers.
    resp.Body.Close()
    if err != nil {
        log.Fatalf("reading response: %v", err)
    }
    if resp.StatusCode < 200 || resp.StatusCode >= 300 {
        log.Fatalf("unexpected HTTP status %s", resp.Status)
    }

    var html string
    // The response is a JSON-encoded string.
    if err := json.Unmarshal(body, &html); err != nil {
        log.Fatalf("decoding response: %v", err)
    }

    // Slicing past the end of a string panics, so clamp the preview length.
    preview := html
    if len(preview) > 500 {
        preview = preview[:500]
    }
    fmt.Println(preview)
}

import java.net.URI;
import java.net.http.*;

var client  = HttpClient.newHttpClient();
var request = HttpRequest.newBuilder()
    .uri(URI.create("https://api.piloterr.com/v2/website/crawler?query=https%3A%2F%2Fexample.com&allow_redirects=true"))
    .header("x-api-key", "YOUR_API_KEY")
    .GET().build();

var response = client.send(request, HttpResponse.BodyHandlers.ofString());
if (response.statusCode() >= 400) {
    throw new IllegalStateException("Piloterr request failed with HTTP status " + response.statusCode());
}

// The body is a JSON string literal. Decode it in a single left-to-right pass:
// chained replace() calls mis-handle escaped backslashes and ignore tabs,
// slashes and four-digit Unicode escapes.
var raw   = response.body().strip();
int start = raw.startsWith("\"") ? 1 : 0;
int end   = raw.length() > start && raw.endsWith("\"") ? raw.length() - 1 : raw.length();
var decoded = new StringBuilder(raw.length());
for (int i = start; i < end; i++) {
    char c = raw.charAt(i);
    if (c != '\\' || i + 1 >= end) {
        decoded.append(c);
        continue;
    }
    char esc = raw.charAt(++i);
    switch (esc) {
        case 'n': decoded.append('\n'); break;
        case 'r': decoded.append('\r'); break;
        case 't': decoded.append('\t'); break;
        case 'b': decoded.append('\b'); break;
        case 'f': decoded.append('\f'); break;
        case 'u':
            // Four hex digits follow; surrogate pairs arrive as two such escapes.
            if (i + 4 < end) {
                decoded.append((char) Integer.parseInt(raw.substring(i + 1, i + 5), 16));
                i += 4;
            } else {
                decoded.append(esc);
            }
            break;
        default:
            // Escaped quote, backslash and slash all stand for themselves.
            decoded.append(esc);
    }
}
var html = decoded.toString();
System.out.println(html.substring(0, Math.min(500, html.length())));

using System.Net.Http;
using System.Text.Json;

// Endpoint with the target URL already percent-encoded in the query string.
const string endpoint =
    "https://api.piloterr.com/v2/website/crawler?query=https%3A%2F%2Fexample.com&allow_redirects=true";

using var client = new HttpClient();
client.DefaultRequestHeaders.Add("x-api-key", "YOUR_API_KEY");

// The API answers with a JSON string literal; deserializing it yields the raw HTML.
var payload = await client.GetStringAsync(endpoint);
var html    = JsonSerializer.Deserialize<string>(payload)!;

// Print at most the first 500 characters.
var previewLength = Math.Min(500, html.Length);
Console.WriteLine(html[..previewLength]);

/// Crawls `https://example.com` through the Piloterr API and prints the first
/// 500 characters of the returned HTML.
///
/// # Errors
/// Returns the `reqwest::Error` raised when the request cannot be sent, the API
/// answers with a 4xx/5xx status, or the body is not a JSON-encoded string.
#[tokio::main]
async fn main() -> Result<(), reqwest::Error> {
    let html = reqwest::Client::new()
        .get("https://api.piloterr.com/v2/website/crawler")
        .header("x-api-key", "YOUR_API_KEY")
        .query(&[("query", "https://example.com"), ("allow_redirects", "true")])
        .send()
        .await?
        .error_for_status()?
        .json::<String>()
        .await?;

    // Byte-slicing at a fixed offset panics when it lands inside a multi-byte
    // UTF-8 character, so cut on a character boundary instead.
    let preview_end = html
        .char_indices()
        .nth(500)
        .map_or(html.len(), |(idx, _)| idx);
    println!("{}", &html[..preview_end]);
    Ok(())
}

Extraire des données du HTML

Analysez le HTML pour extraire des éléments spécifiques — ici on extrait le titre de la page et tous les titres <h1>.

from bs4 import BeautifulSoup

# Parse the crawled markup with the parser shipped in the standard library.
soup = BeautifulSoup(html, "html.parser")

# Page title, with a placeholder when the document has no <title> element.
title = soup.find("title")
title_text = "N/A" if title is None else title.text
print("Titre de la page:", title_text)

# Text of every <h1>, stripped of surrounding whitespace.
h1_texts = [heading.get_text(strip=True) for heading in soup.find_all("h1")]
for text in h1_texts:
    print("H1:", text)

import { parse } from "node-html-parser";

// Build a queryable tree from the crawled HTML string.
const root = parse(html);

// Page title, falling back to "N/A" when the document has none.
const titleText = root.querySelector("title")?.text ?? "N/A";
console.log("Titre de la page:", titleText);

// Print every <h1> with surrounding whitespace removed.
root.querySelectorAll("h1").forEach((heading) => {
  console.log("H1:", heading.text.trim());
});

// Real-world HTML is rarely well-formed: collect libxml's parse warnings
// internally rather than silencing every error with the @ operator.
$previousErrorMode = libxml_use_internal_errors(true);
$dom = new DOMDocument();
$dom->loadHTML($html);
libxml_clear_errors();
libxml_use_internal_errors($previousErrorMode);
$xpath = new DOMXPath($dom);

// Page title, with a placeholder when the document has no <title> element.
$title = $xpath->query("//title")->item(0);
echo "Titre de la page: " . ($title ? $title->textContent : "N/A") . "\n";

// Text of every <h1>, trimmed.
foreach ($xpath->query("//h1") as $h) {
    echo "H1: " . trim($h->textContent) . "\n";
}

import (
    "fmt"
    "strings"
    "golang.org/x/net/html"
)

doc, _ := html.Parse(strings.NewReader(html))
var traverse func(*html.Node)
traverse = func(n *html.Node) {
    if n.Type == html.ElementNode && n.Data == "title" && n.FirstChild != nil {
        fmt.Println("Titre de la page:", n.FirstChild.Data)
    }
    if n.Type == html.ElementNode && n.Data == "h1" && n.FirstChild != nil {
        fmt.Println("H1:", n.FirstChild.Data)
    }
    for c := n.FirstChild; c != nil; c = c.NextSibling { traverse(c) }
}
traverse(doc)

import org.jsoup.Jsoup;

// Parse the crawled markup into a jsoup document.
var doc = Jsoup.parse(html);

// jsoup exposes the text of the <title> element directly.
System.out.println("Titre de la page: " + doc.title());

// Print the text content of every <h1>.
for (var heading : doc.select("h1")) {
    System.out.println("H1: " + heading.text());
}

using HtmlAgilityPack;

// Load the crawled markup into an HtmlAgilityPack document.
var doc = new HtmlDocument();
doc.LoadHtml(html);

// Page title, with a placeholder when the document has no <title> element.
var titleNode = doc.DocumentNode.SelectSingleNode("//title");
var titleText = titleNode?.InnerText ?? "N/A";
Console.WriteLine($"Titre de la page: {titleText}");

// SelectNodes may yield null instead of an empty collection, so guard before iterating.
var headings = doc.DocumentNode.SelectNodes("//h1");
if (headings != null)
{
    foreach (var heading in headings)
    {
        Console.WriteLine($"H1: {heading.InnerText.Trim()}");
    }
}

// Minimal regex-based extraction.
use regex::Regex; // add regex = "1" to Cargo.toml

// (?i) matches tags regardless of letter case and (?s) lets `.` cross newlines,
// so elements whose content spans several lines are still captured.
let title_re = Regex::new(r"(?is)<title[^>]*>(.*?)</title>").expect("static pattern is valid");
if let Some(cap) = title_re.captures(&html) {
    println!("Titre de la page: {}", cap[1].trim());
}

let h1_re = Regex::new(r"(?is)<h1[^>]*>(.*?)</h1>").expect("static pattern is valid");
for cap in h1_re.captures_iter(&html) {
    println!("H1: {}", cap[1].trim());
}

Construire un script de surveillance des prix

Crawlez une page produit et extrayez le prix avec un sélecteur CSS (ou, selon le langage, une requête XPath ou une expression régulière).

import requests
from bs4 import BeautifulSoup

API_KEY   = "YOUR_API_KEY"
WATCH_URL = "https://www.example-shop.com/product/123"

def crawl(url: str) -> str:
    """Return the raw HTML of ``url`` fetched through the Piloterr crawler.

    Raises:
        requests.HTTPError: if the API answers with a 4xx/5xx status.
        requests.RequestException: on network failure or timeout.
    """
    r = requests.get(
        "https://api.piloterr.com/v2/website/crawler",
        headers={"x-api-key": API_KEY},
        params={"query": url, "allow_redirects": "true"},
        timeout=60,  # requests never times out by default
    )
    # Do not try to parse an error payload as if it were the page's HTML.
    r.raise_for_status()
    return r.json()

def extract_price(html: str) -> str | None:
    """Return the text of the first price-like element, or ``None`` if absent."""
    soup = BeautifulSoup(html, "html.parser")
    el = soup.select_one("[data-price], .price, #price")
    return el.get_text(strip=True) if el is not None else None

price = extract_price(crawl(WATCH_URL))
print(f"Prix actuel : {price}" if price else "Élément prix introuvable.")

import { parse } from "node-html-parser";

const API_KEY   = "YOUR_API_KEY";
const WATCH_URL = "https://www.example-shop.com/product/123";

/**
 * Fetches a page through the Piloterr crawler and resolves with its raw HTML.
 *
 * @param url Address of the page to crawl.
 * @throws Error when the API answers with a non-2xx status.
 */
async function crawl(url: string): Promise<string> {
  const params = new URLSearchParams({ query: url, allow_redirects: "true" });
  const res    = await fetch(`https://api.piloterr.com/v2/website/crawler?${params}`, {
    headers: { "x-api-key": API_KEY },
  });
  // fetch() only rejects on network failure, so HTTP errors must be surfaced explicitly.
  if (!res.ok) {
    throw new Error(`Piloterr request failed: ${res.status} ${res.statusText}`);
  }
  return res.json();
}

const html  = await crawl(WATCH_URL);
const root  = parse(html);
// First element matching any of the common price hooks, or null when none exists.
const price = root.querySelector("[data-price], .price, #price")?.text.trim() ?? null;
console.log(price ? `Prix actuel : ${price}` : "Élément prix introuvable.");

<?php
/**
 * Fetches a page through the Piloterr crawler and returns its raw HTML.
 *
 * @param string $apiKey Piloterr API key sent in the x-api-key header.
 * @param string $url    Address of the page to crawl.
 * @return string Raw HTML of the page.
 * @throws RuntimeException When the transfer fails, the API answers with an
 *                          HTTP error, or the body is not a JSON string.
 */
function crawl(string $apiKey, string $url): string {
    $params = http_build_query(["query" => $url, "allow_redirects" => "true"]);
    $ch = curl_init("https://api.piloterr.com/v2/website/crawler?{$params}");
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_HTTPHEADER, ["x-api-key: {$apiKey}"]);

    // Read the transfer outcome before closing the handle.
    $body   = curl_exec($ch);
    $error  = curl_error($ch);
    $status = curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
    curl_close($ch);

    if ($body === false) {
        throw new RuntimeException("Piloterr request failed: {$error}");
    }
    if ($status >= 400) {
        throw new RuntimeException("Piloterr request failed with HTTP status {$status}");
    }

    // Returning a non-string would violate the declared return type.
    $html = json_decode($body, true);
    if (!is_string($html)) {
        throw new RuntimeException("Unexpected response: expected a JSON-encoded HTML string");
    }
    return $html;
}

$html  = crawl("YOUR_API_KEY", "https://www.example-shop.com/product/123");

// Collect libxml's parse warnings internally rather than silencing errors with @.
$previousErrorMode = libxml_use_internal_errors(true);
$dom   = new DOMDocument();
$dom->loadHTML($html);
libxml_clear_errors();
libxml_use_internal_errors($previousErrorMode);
$xpath = new DOMXPath($dom);

// Tried in order; the first expression is the XPath equivalent of the CSS
// selector ".price" (any element whose class list contains the token "price").
$selectors = [
    "//*[contains(concat(' ', normalize-space(@class), ' '), ' price ')]",
    "//*[@id='price']",
    "//*[@data-price]",
];

$price = null;
foreach ($selectors as $sel) {
    $nodes = $xpath->query($sel);
    if ($nodes->length > 0) { $price = trim($nodes->item(0)->textContent); break; }
}
// Compare explicitly: the string "0" is falsy in PHP but is a legitimate price text.
echo ($price !== null && $price !== '') ? "Prix actuel : {$price}\n" : "Élément prix introuvable.\n";

package main

import (
    "encoding/json"
    "fmt"
    "io"
    "log"
    "net/http"
    "net/url"
    "regexp"
)

// crawl fetches pageUrl through the Piloterr crawler and returns the page's raw
// HTML. It returns an error when the request cannot be built or sent, the API
// answers with a non-2xx status, or the body is not a JSON-encoded string.
func crawl(apiKey, pageUrl string) (string, error) {
    params := url.Values{"query": {pageUrl}, "allow_redirects": {"true"}}
    req, err := http.NewRequest("GET", "https://api.piloterr.com/v2/website/crawler?"+params.Encode(), nil)
    if err != nil {
        return "", fmt.Errorf("building request: %w", err)
    }
    req.Header.Set("x-api-key", apiKey)

    resp, err := http.DefaultClient.Do(req)
    if err != nil {
        // resp is nil here; touching resp.Body would panic.
        return "", fmt.Errorf("sending request: %w", err)
    }
    defer resp.Body.Close()

    body, err := io.ReadAll(resp.Body)
    if err != nil {
        return "", fmt.Errorf("reading response: %w", err)
    }
    if resp.StatusCode < 200 || resp.StatusCode >= 300 {
        return "", fmt.Errorf("unexpected HTTP status %s", resp.Status)
    }

    var html string
    if err := json.Unmarshal(body, &html); err != nil {
        return "", fmt.Errorf("decoding response: %w", err)
    }
    return html, nil
}

// main crawls the watched product page and prints the first price it finds.
func main() {
    html, err := crawl("YOUR_API_KEY", "https://www.example-shop.com/product/123")
    if err != nil {
        log.Fatal(err)
    }

    // Captures the text that follows the opening tag of the first element
    // whose class attribute starts with "price".
    re    := regexp.MustCompile(`class="price[^"]*"[^>]*>([^<]+)`)
    match := re.FindStringSubmatch(html)
    if match != nil {
        fmt.Println("Prix actuel :", match[1])
    } else {
        fmt.Println("Élément prix introuvable.")
    }
}

import java.net.URI;
import java.net.URLEncoder;
import java.net.http.*;
import java.nio.charset.StandardCharsets;
import org.jsoup.Jsoup;

public class Main {
    /**
     * Decodes a JSON string literal (surrounding quotes included) into its value.
     *
     * Escapes are resolved in a single left-to-right pass, so an escaped
     * backslash can never be mistaken for the start of another escape, and
     * tabs, slashes and four-digit Unicode escapes are handled as well.
     *
     * @param json the raw response body holding one JSON string
     * @return the decoded text
     */
    private static String decodeJsonString(String json) {
        var raw   = json.strip();
        int start = raw.startsWith("\"") ? 1 : 0;
        int end   = raw.length() > start && raw.endsWith("\"") ? raw.length() - 1 : raw.length();
        var out   = new StringBuilder(raw.length());
        for (int i = start; i < end; i++) {
            char c = raw.charAt(i);
            if (c != '\\' || i + 1 >= end) {
                out.append(c);
                continue;
            }
            char esc = raw.charAt(++i);
            switch (esc) {
                case 'n': out.append('\n'); break;
                case 'r': out.append('\r'); break;
                case 't': out.append('\t'); break;
                case 'b': out.append('\b'); break;
                case 'f': out.append('\f'); break;
                case 'u':
                    // Four hex digits follow; surrogate pairs arrive as two such escapes.
                    if (i + 4 < end) {
                        out.append((char) Integer.parseInt(raw.substring(i + 1, i + 5), 16));
                        i += 4;
                    } else {
                        out.append(esc);
                    }
                    break;
                default:
                    // Escaped quote, backslash and slash all stand for themselves.
                    out.append(esc);
            }
        }
        return out.toString();
    }

    /**
     * Crawls the watched product page and prints the first price it finds.
     *
     * @throws Exception if the request fails or the API answers with an HTTP error
     */
    public static void main(String[] args) throws Exception {
        var apiKey   = "YOUR_API_KEY";
        var watchUrl = URLEncoder.encode("https://www.example-shop.com/product/123", StandardCharsets.UTF_8);
        var url      = "https://api.piloterr.com/v2/website/crawler?query=" + watchUrl + "&allow_redirects=true";

        var client   = HttpClient.newHttpClient();
        var request  = HttpRequest.newBuilder().uri(URI.create(url))
            .header("x-api-key", apiKey).GET().build();
        var response = client.send(request, HttpResponse.BodyHandlers.ofString());
        if (response.statusCode() >= 400) {
            throw new IllegalStateException("Piloterr request failed with HTTP status " + response.statusCode());
        }

        var html     = decodeJsonString(response.body());
        var doc      = Jsoup.parse(html);
        var priceEl  = doc.selectFirst(".price, #price, [data-price]");
        System.out.println(priceEl != null ? "Prix actuel : " + priceEl.text() : "Élément prix introuvable.");
    }
}

using System.Net.Http;
using System.Text.Json;
using HtmlAgilityPack;

const string productUrl = "https://www.example-shop.com/product/123";
// Matches any element carrying a price-like class, id or data attribute.
const string priceXPath = "//*[contains(@class,'price') or @id='price' or @data-price]";

using var client = new HttpClient();
client.DefaultRequestHeaders.Add("x-api-key", "YOUR_API_KEY");

// The product URL travels as a query parameter, so it must be percent-encoded.
var encodedUrl = Uri.EscapeDataString(productUrl);
var endpoint   = $"https://api.piloterr.com/v2/website/crawler?query={encodedUrl}&allow_redirects=true";

// The API answers with a JSON string literal; deserializing it yields the raw HTML.
var payload = await client.GetStringAsync(endpoint);
var html    = JsonSerializer.Deserialize<string>(payload)!;

var doc = new HtmlDocument();
doc.LoadHtml(html);

var priceNode = doc.DocumentNode.SelectSingleNode(priceXPath);
var message   = priceNode != null
    ? $"Prix actuel : {priceNode.InnerText.Trim()}"
    : "Élément prix introuvable.";
Console.WriteLine(message);

use reqwest::Client;
use regex::Regex;

async fn crawl(client: &Client, api_key: &str, url: &str) -> String {
    client.get("https://api.piloterr.com/v2/website/crawler")
        .header("x-api-key", api_key)
        .query(&[("query", url), ("allow_redirects", "true")])
        .send().await.unwrap().json::<String>().await.unwrap()
}

#[tokio::main]
async fn main() -> Result<(), reqwest::Error> {
    let client = Client::new();
    let html   = crawl(&client, "YOUR_API_KEY", "https://www.example-shop.com/product/123").await;

    let re = Regex::new(r#"class="price[^"]*"[^>]*>([^<]+)"#).unwrap();
    match re.captures(&html) {
        Some(cap) => println!("Prix actuel : {}", cap[1].trim()),
        None      => println!("Élément prix introuvable."),
    }
    Ok(())
}

Définissez allow_redirects=true pour suivre automatiquement les redirections HTTP 301/302 — utile pour les URLs courtes ou les plateformes e-commerce qui redirigent les pages produit.