Extract Content as Markdown
Convert any webpage to clean Markdown text using the CaptureKit content API — strip ads and boilerplate to get the pure article or documentation content.
Overview
The CaptureKit content endpoint fetches a webpage, strips navigation, ads, and layout chrome, then returns the main body as clean Markdown. Use it for building RAG datasets, indexing documentation, or archiving articles.
Prerequisites
- A CaptureKit API key — get one at app.capturekit.dev
- Install dependencies for your language:
pip install requests
No extra dependencies — uses the native fetch API (Node 18+).
curl extension enabled (on by default).
No extra dependencies — uses net/http (Go 1.18+).
No extra dependencies — uses java.net.http (Java 11+).
No extra dependencies — uses System.Net.Http (.NET 6+).
# Cargo.toml
[dependencies]
reqwest = { version = "0.12", features = ["json"] }
tokio = { version = "1", features = ["full"] }
serde_json = "1"
Steps
Fetch the page content
Call GET /v1/content with the url parameter.
import requests
API_KEY = "YOUR_API_KEY"
response = requests.get(
"https://api.capturekit.dev/v1/content",
headers={"x-api-key": API_KEY},
params={"url": "https://stripe.com/docs/payments"},
)
data = response.json()
print(data)
const API_KEY = "YOUR_API_KEY";
const params = new URLSearchParams({ url: "https://stripe.com/docs/payments" });
const response = await fetch(`https://api.capturekit.dev/v1/content?${params}`, {
headers: { "x-api-key": API_KEY },
});
const data = await response.json();
console.log(data);
<?php
$apiKey = "YOUR_API_KEY";
$params = http_build_query(["url" => "https://stripe.com/docs/payments"]);
$ch = curl_init("https://api.capturekit.dev/v1/content?{$params}");
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_HTTPHEADER, ["x-api-key: {$apiKey}"]);
$data = json_decode(curl_exec($ch), true);
curl_close($ch);
print_r($data);
package main
import (
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
)
func main() {
params := url.Values{"url": {"https://stripe.com/docs/payments"}}
req, _ := http.NewRequest("GET", "https://api.capturekit.dev/v1/content?"+params.Encode(), nil)
req.Header.Set("x-api-key", "YOUR_API_KEY")
resp, _ := http.DefaultClient.Do(req)
defer resp.Body.Close()
body, _ := io.ReadAll(resp.Body)
var data map[string]any
json.Unmarshal(body, &data)
fmt.Println(data)
}import java.net.URI;
import java.net.URLEncoder;
import java.net.http.*;
import java.nio.charset.StandardCharsets;
var client = HttpClient.newHttpClient();
var target = URLEncoder.encode("https://stripe.com/docs/payments", StandardCharsets.UTF_8);
var request = HttpRequest.newBuilder()
.uri(URI.create("https://api.capturekit.dev/v1/content?url=" + target))
.header("x-api-key", "YOUR_API_KEY").GET().build();
var response = client.send(request, HttpResponse.BodyHandlers.ofString());
System.out.println(response.body());
using System.Net.Http;
using var client = new HttpClient();
client.DefaultRequestHeaders.Add("x-api-key", "YOUR_API_KEY");
var target = Uri.EscapeDataString("https://stripe.com/docs/payments");
var body = await client.GetStringAsync($"https://api.capturekit.dev/v1/content?url={target}");
Console.WriteLine(body);
#[tokio::main]
// Requests the extracted content for one page from the CaptureKit content
// endpoint and pretty-prints the decoded JSON value.
//
// Returns a `reqwest::Error` when the request cannot be sent, the server
// answers with a 4xx/5xx status, or the body is not valid JSON.
async fn main() -> Result<(), reqwest::Error> {
    let data = reqwest::Client::new()
        .get("https://api.capturekit.dev/v1/content")
        .header("x-api-key", "YOUR_API_KEY")
        .query(&[("url", "https://stripe.com/docs/payments")])
        .send()
        .await?
        // Turn 4xx/5xx responses into errors instead of decoding them as content.
        .error_for_status()?
        .json::<serde_json::Value>()
        .await?;
    println!("{:#?}", data);
    Ok(())
}
Access the Markdown content
The response includes markdown, title, description, author, published_at, and word_count.
print(f"Title : {data.get('title')}")
print(f"Author : {data.get('author', 'N/A')}")
print(f"Word count : {data.get('word_count')} words")
print()
print("--- Markdown preview ---")
markdown = data.get("markdown", "")
print(markdown[:1000])
console.log(`Title : ${data.title}`);
console.log(`Author : ${data.author ?? "N/A"}`);
console.log(`Word count : ${data.word_count} words\n`);
console.log("--- Markdown preview ---");
console.log(data.markdown?.slice(0, 1000));
echo "Title : {$data['title']}\n";
echo "Author : " . ($data["author"] ?? "N/A") . "\n";
echo "Word count : {$data['word_count']} words\n\n";
echo "--- Markdown preview ---\n";
echo substr($data["markdown"] ?? "", 0, 1000) . "\n";
fmt.Printf("Title : %v\nAuthor : %v\nWord count : %v words\n\n",
data["title"], data["author"], data["word_count"])
markdown := data["markdown"].(string)
if len(markdown) > 1000 { markdown = markdown[:1000] }
fmt.Println("--- Markdown preview ---\n" + markdown)
import org.json.*;
var d = new JSONObject(response.body());
var markdown = d.getString("markdown");
System.out.printf("Title : %s%nAuthor : %s%nWord count : %d words%n%n",
d.getString("title"), d.optString("author", "N/A"), d.getInt("word_count"));
System.out.println("--- Markdown preview ---");
System.out.println(markdown.substring(0, Math.min(1000, markdown.length())));
using System.Text.Json;
var d = JsonDocument.Parse(body).RootElement;
var markdown = d.GetProperty("markdown").GetString() ?? "";
Console.WriteLine($"Title : {d.GetProperty("title")}");
Console.WriteLine($"Author : {(d.TryGetProperty("author", out var a) ? a : (object)"N/A")}");
Console.WriteLine($"Word count : {d.GetProperty("word_count")} words\n");
Console.WriteLine("--- Markdown preview ---");
Console.WriteLine(markdown[..Math.Min(1000, markdown.Length)]);
let markdown = data["markdown"].as_str().unwrap_or("");
// Print the metadata fields; string fields fall back to a placeholder when
// they are absent or not JSON strings.
println!("Title : {}", data["title"].as_str().unwrap_or(""));
println!("Author : {}", data["author"].as_str().unwrap_or("N/A"));
println!("Word count : {} words\n", data["word_count"]);
println!("--- Markdown preview ---");
// Take the first 1000 characters, not bytes: slicing a `&str` at a byte
// offset that falls inside a multi-byte UTF-8 character panics.
println!("{}", markdown.chars().take(1000).collect::<String>());
Crawl and archive a list of documentation pages
Fetch and save multiple pages as Markdown files for offline search or RAG ingestion.
import os, time, requests

API_KEY = "YOUR_API_KEY"
os.makedirs("docs_archive", exist_ok=True)

PAGES = [
    "https://stripe.com/docs/payments",
    "https://stripe.com/docs/billing",
    "https://stripe.com/docs/connect",
]

# Fetch each page and write its Markdown to docs_archive/<last URL segment>.md.
for page_url in PAGES:
    r = requests.get(
        "https://api.capturekit.dev/v1/content",
        headers={"x-api-key": API_KEY},
        params={"url": page_url},
    )
    # Fail on 4xx/5xx instead of archiving an error payload as a document.
    r.raise_for_status()
    data = r.json()

    slug = page_url.rstrip("/").split("/")[-1]
    path = f"docs_archive/{slug}.md"
    # Explicit UTF-8: the platform default encoding (e.g. cp1252 on Windows)
    # can raise UnicodeEncodeError on non-ASCII characters in the Markdown.
    with open(path, "w", encoding="utf-8") as f:
        f.write(f"# {data.get('title', slug)}\n\n")
        f.write(data.get("markdown", ""))
    print(f"Saved: {path} ({data.get('word_count', 0)} words)")
    time.sleep(1)  # pause before the next request
print("Done!")
import { mkdirSync, writeFileSync } from "fs";
const API_KEY = "YOUR_API_KEY";
mkdirSync("docs_archive", { recursive: true });

const PAGES = [
  "https://stripe.com/docs/payments",
  "https://stripe.com/docs/billing",
  "https://stripe.com/docs/connect",
];

// Fetch each page and write its Markdown to docs_archive/<last URL segment>.md.
for (const pageUrl of PAGES) {
  const params = new URLSearchParams({ url: pageUrl });
  const res = await fetch(`https://api.capturekit.dev/v1/content?${params}`, {
    headers: { "x-api-key": API_KEY },
  });
  // fetch() only rejects on network failure, so HTTP errors are checked here.
  if (!res.ok) {
    throw new Error(`Request for ${pageUrl} failed with HTTP ${res.status}`);
  }
  const data = await res.json();

  // No trailing `!` here: the non-null assertion is TypeScript-only syntax
  // and a syntax error in plain JavaScript.
  const slug = pageUrl.replace(/\/$/, "").split("/").at(-1);
  writeFileSync(`docs_archive/${slug}.md`, `# ${data.title ?? slug}\n\n${data.markdown ?? ""}`);
  console.log(`Saved: docs_archive/${slug}.md (${data.word_count ?? 0} words)`);
  await new Promise(r => setTimeout(r, 1000)); // pause before the next request
}
console.log("Done!");
<?php
$apiKey = "YOUR_API_KEY";
// Create the output directory only when it is missing, rather than silencing
// every mkdir() error with the @ operator.
if (!is_dir("docs_archive")) {
    mkdir("docs_archive", 0777, true);
}
$pages = [
    "https://stripe.com/docs/payments",
    "https://stripe.com/docs/billing",
    "https://stripe.com/docs/connect",
];
// Fetch each page and write its Markdown to docs_archive/<last URL segment>.md.
foreach ($pages as $pageUrl) {
    $params = http_build_query(["url" => $pageUrl]);
    $ch = curl_init("https://api.capturekit.dev/v1/content?{$params}");
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_HTTPHEADER, ["x-api-key: {$apiKey}"]);
    $raw = curl_exec($ch);
    curl_close($ch);

    // curl_exec() returns false on transport failure and json_decode()
    // returns null on invalid JSON; neither can be indexed as an array.
    $data = is_string($raw) ? json_decode($raw, true) : null;
    if (!is_array($data)) {
        echo "Skipped: {$pageUrl} (request failed or response was not JSON)\n";
        continue;
    }

    $slug = basename(rtrim($pageUrl, "/"));
    $path = "docs_archive/{$slug}.md";
    // Missing fields fall back to the slug, an empty body, and 0 words.
    $title = $data["title"] ?? $slug;
    $markdown = $data["markdown"] ?? "";
    $wordCount = $data["word_count"] ?? 0;
    file_put_contents($path, "# {$title}\n\n{$markdown}");
    echo "Saved: {$path} ({$wordCount} words)\n";
    sleep(1); // pause before the next request
}
echo "Done!\n";
package main
import (
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
"os"
"path/filepath"
"strings"
"time"
)
func main() {
os.MkdirAll("docs_archive", 0755)
apiKey := "YOUR_API_KEY"
pages := []string{
"https://stripe.com/docs/payments",
"https://stripe.com/docs/billing",
"https://stripe.com/docs/connect",
}
for _, pageURL := range pages {
params := url.Values{"url": {pageURL}}
req, _ := http.NewRequest("GET", "https://api.capturekit.dev/v1/content?"+params.Encode(), nil)
req.Header.Set("x-api-key", apiKey)
resp, _ := http.DefaultClient.Do(req)
body, _ := io.ReadAll(resp.Body)
resp.Body.Close()
var data map[string]any
json.Unmarshal(body, &data)
parts := strings.Split(strings.TrimRight(pageURL, "/"), "/")
slug := parts[len(parts)-1]
path := filepath.Join("docs_archive", slug+".md")
content := fmt.Sprintf("# %v\n\n%v", data["title"], data["markdown"])
os.WriteFile(path, []byte(content), 0644)
fmt.Printf("Saved: %s (%.0f words)\n", path, data["word_count"])
time.Sleep(time.Second)
}
fmt.Println("Done!")
}import java.net.URI;
import java.net.URLEncoder;
import java.net.http.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.*;
import java.util.List;
import org.json.*;
/**
 * Archives a fixed list of documentation pages as Markdown files under
 * {@code docs_archive/}, one file per page, named after the last URL segment.
 */
public class Main {
    /**
     * Fetches every page through the CaptureKit content endpoint and writes
     * its Markdown to disk, pausing one second between requests.
     *
     * @param args unused
     * @throws Exception if a request, the JSON parsing, or a file write fails,
     *                   or if the server answers with a non-2xx status
     */
    public static void main(String[] args) throws Exception {
        var client = HttpClient.newHttpClient();
        var apiKey = "YOUR_API_KEY";
        Files.createDirectories(Path.of("docs_archive"));
        var pages = List.of(
                "https://stripe.com/docs/payments",
                "https://stripe.com/docs/billing",
                "https://stripe.com/docs/connect");
        for (var pageUrl : pages) {
            var encoded = URLEncoder.encode(pageUrl, StandardCharsets.UTF_8);
            var req = HttpRequest.newBuilder()
                    .uri(URI.create("https://api.capturekit.dev/v1/content?url=" + encoded))
                    .header("x-api-key", apiKey)
                    .GET()
                    .build();
            var resp = client.send(req, HttpResponse.BodyHandlers.ofString());
            // HttpClient does not throw on HTTP error statuses, so check explicitly.
            if (resp.statusCode() / 100 != 2) {
                throw new IllegalStateException(
                        "Request for " + pageUrl + " failed with HTTP " + resp.statusCode());
            }
            var data = new JSONObject(resp.body());
            // Strip one trailing slash, then everything up to the last remaining slash.
            var slug = pageUrl.replaceAll("/$", "").replaceAll(".*/", "");
            // opt* accessors fall back instead of throwing when a field is missing.
            var content = "# " + data.optString("title", slug) + "\n\n" + data.optString("markdown", "");
            Files.writeString(Path.of("docs_archive", slug + ".md"), content);
            System.out.printf("Saved: docs_archive/%s.md (%d words)%n", slug, data.optInt("word_count", 0));
            Thread.sleep(1000);
        }
        System.out.println("Done!");
    }
}
using System.Net.Http;
using System.Text.Json;
var apiKey = "YOUR_API_KEY";
using var client = new HttpClient();
client.DefaultRequestHeaders.Add("x-api-key", apiKey);
Directory.CreateDirectory("docs_archive");

var pages = new[]
{
    "https://stripe.com/docs/payments",
    "https://stripe.com/docs/billing",
    "https://stripe.com/docs/connect",
};

// Fetch each page and write its Markdown to docs_archive/<last URL segment>.md.
foreach (var pageUrl in pages)
{
    var encoded = Uri.EscapeDataString(pageUrl);
    var body = await client.GetStringAsync($"https://api.capturekit.dev/v1/content?url={encoded}");

    // JsonDocument is IDisposable; dispose it at the end of each iteration.
    using var doc = JsonDocument.Parse(body);
    var data = doc.RootElement;
    var slug = pageUrl.TrimEnd('/').Split('/').Last();

    // TryGetProperty avoids the KeyNotFoundException that GetProperty throws
    // when a field is missing; fall back to the slug, empty text, and "0".
    var title = data.TryGetProperty("title", out var titleEl) && titleEl.ValueKind == JsonValueKind.String
        ? titleEl.GetString()
        : slug;
    var markdown = data.TryGetProperty("markdown", out var markdownEl) && markdownEl.ValueKind == JsonValueKind.String
        ? markdownEl.GetString()
        : "";
    var wordCount = data.TryGetProperty("word_count", out var wordCountEl) ? wordCountEl.ToString() : "0";

    File.WriteAllText($"docs_archive/{slug}.md", $"# {title}\n\n{markdown}");
    Console.WriteLine($"Saved: docs_archive/{slug}.md ({wordCount} words)");
    await Task.Delay(1000); // pause before the next request
}
Console.WriteLine("Done!");
use reqwest::Client;
use serde_json::Value;
use std::{fs, path::Path, time::Duration};
use tokio::time::sleep;
/// Archives a fixed list of documentation pages as Markdown files under
/// `docs_archive/`, one file per page, named after the last URL segment.
///
/// # Errors
///
/// Returns an error when the output directory or a file cannot be written,
/// when a request fails or is answered with a 4xx/5xx status, or when a
/// response body is not valid JSON.
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let client = Client::new();
    let api_key = "YOUR_API_KEY";
    let pages = [
        "https://stripe.com/docs/payments",
        "https://stripe.com/docs/billing",
        "https://stripe.com/docs/connect",
    ];
    let archive_dir = Path::new("docs_archive");
    // Propagate I/O failures with `?` instead of panicking via `unwrap()`.
    fs::create_dir_all(archive_dir)?;

    for page_url in &pages {
        let data = client
            .get("https://api.capturekit.dev/v1/content")
            .header("x-api-key", api_key)
            .query(&[("url", page_url)])
            .send()
            .await?
            // Turn 4xx/5xx responses into errors instead of archiving them.
            .error_for_status()?
            .json::<Value>()
            .await?;

        // `rsplit(..).next()` always yields a segment (possibly empty), so the
        // empty case is filtered out explicitly to make the fallback reachable.
        let slug = page_url
            .trim_end_matches('/')
            .rsplit('/')
            .next()
            .filter(|segment| !segment.is_empty())
            .unwrap_or("page");
        let path = archive_dir.join(format!("{}.md", slug));

        // A missing title falls back to the slug, a missing body to empty text.
        let title = data["title"].as_str().unwrap_or(slug);
        let markdown = data["markdown"].as_str().unwrap_or("");
        fs::write(&path, format!("# {}\n\n{}", title, markdown))?;
        println!("Saved: {} ({} words)", path.display(), data["word_count"]);
        sleep(Duration::from_secs(1)).await;
    }
    println!("Done!");
    Ok(())
}
The extracted Markdown is ideal as context chunks for a RAG (Retrieval-Augmented Generation) pipeline. Chunk by headings and embed with your preferred vector store for semantic search over any website's documentation.