Documentation Index
Fetch the complete documentation index at: https://developers.scrapeunblocker.com/llms.txt
Use this file to discover all available pages before exploring further.
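Fetching the index is an ordinary GET, so a short requests snippet is enough (a minimal sketch; the index is served as plain text):
import requests
# Pull the llms.txt index so the listed pages can be explored programmatically.
index = requests.get("https://developers.scrapeunblocker.com/llms.txt", timeout=30)
print(index.text)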
There is no Python SDK to install - the API is plain HTTP, so the requests library covers everything. Async examples use httpx.
Store your API key in an environment variable:
export SCRAPEUNBLOCKER_KEY="su_live_..."
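The examples below read the key with os.environ["SCRAPEUNBLOCKER_KEY"], which raises a bare KeyError when the variable is unset; a small guard gives a clearer failure (a sketch, not part of the API):
import os
# Fail fast with a readable message if the key was never exported.
KEY = os.environ.get("SCRAPEUNBLOCKER_KEY")
if not KEY:
    raise RuntimeError("Set SCRAPEUNBLOCKER_KEY before running these examples")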
Fetch page HTML
import os
import requests
response = requests.post(
"https://api.scrapeunblocker.com/getPageSource",
params={"url": "https://example.com"},
headers={"x-scrapeunblocker-key": os.environ["SCRAPEUNBLOCKER_KEY"]},
)
html = response.text
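Before trusting the body, it is worth surfacing HTTP errors and persisting the page (a minimal sketch continuing the snippet above):
# Raise on 4xx/5xx instead of silently saving an error page.
response.raise_for_status()
with open("page.html", "w", encoding="utf-8") as f:
    f.write(html)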
Get parsed JSON instead of HTML
The killer feature: pass parsed_data=True and the API extracts structured data using Schema.org, __NEXT_DATA__, or AI-generated rules - no per-site parsers to maintain.
import os
import requests
response = requests.post(
"https://api.scrapeunblocker.com/getPageSource",
params={
"url": "https://www.amazon.com/dp/B08N5WRWNW",
"parsed_data": True,
},
headers={"x-scrapeunblocker-key": os.environ["SCRAPEUNBLOCKER_KEY"]},
)
payload = response.json()
print(payload["data"]["page_type"]) # "product"
print(payload["data"]["data"]["title"])
print(payload["data"]["data"]["price"])
See the parsed data guide for response shapes.
Scrape a Google SERP
import os
import requests
response = requests.post(
"https://api.scrapeunblocker.com/serpApi",
params={
"keyword": "best running shoes",
"pages_to_check": 2,
},
headers={"x-scrapeunblocker-key": os.environ["SCRAPEUNBLOCKER_KEY"]},
)
serp = response.json()
for result in serp["organic"]:
    print(result["position"], result["title"], result["url"])
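To keep the rankings around between runs, the same three fields drop straight into a CSV (a sketch using only the position, title, and url keys shown above):
import csv
# Persist the organic results so later runs can diff rankings.
with open("serp.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["position", "title", "url"])
    writer.writeheader()
    for result in serp["organic"]:
        writer.writerow({key: result[key] for key in ("position", "title", "url")})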
Force a country
proxy_country works on all three endpoints.
import os
import requests
response = requests.post(
"https://api.scrapeunblocker.com/getPageSource",
params={
"url": "https://www.amazon.de/dp/B08N5WRWNW",
"parsed_data": True,
"proxy_country": "de",
},
headers={"x-scrapeunblocker-key": os.environ["SCRAPEUNBLOCKER_KEY"]},
)
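The same parameter slots into the SERP endpoint unchanged (a sketch reusing the serpApi request from above):
response = requests.post(
    "https://api.scrapeunblocker.com/serpApi",
    params={
        "keyword": "best running shoes",
        "pages_to_check": 2,
        "proxy_country": "de",
    },
    headers={"x-scrapeunblocker-key": os.environ["SCRAPEUNBLOCKER_KEY"]},
)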
Capture cookies and the proxy used
import os
import requests
response = requests.post(
"https://api.scrapeunblocker.com/getPageSource",
params={
"url": "https://example.com",
"get_cookies": True,
},
headers={"x-scrapeunblocker-key": os.environ["SCRAPEUNBLOCKER_KEY"]},
)
data = response.json()
html = data["html"]
cookies = {c["name"]: c["value"] for c in data["cookies"]}
served_by = data["proxy"] # e.g. "us"
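The captured cookies are plain name/value pairs, so they can be replayed in a direct follow-up request (a sketch; whether the target site honours replayed cookies depends on the site):
# Reuse the captured session cookies for a direct request to the same site.
follow_up = requests.get("https://example.com/", cookies=cookies)
print(follow_up.status_code)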
Fetch an image as PNG bytes
import os
import requests
response = requests.post(
"https://api.scrapeunblocker.com/getImage",
params={"url": "https://example.com/photo.jpg"},
headers={"x-scrapeunblocker-key": os.environ["SCRAPEUNBLOCKER_KEY"]},
)
with open("photo.png", "wb") as f:
    f.write(response.content)
Async with httpx
For high-throughput crawls, httpx.AsyncClient runs many requests concurrently. Cap concurrency to your plan’s limit with a semaphore.
import os
import asyncio
import httpx
KEY = os.environ["SCRAPEUNBLOCKER_KEY"]
CONCURRENCY = 10
async def scrape(client, sem, url):
    async with sem:
        r = await client.post(
            "https://api.scrapeunblocker.com/getPageSource",
            params={"url": url, "parsed_data": True},
        )
        return r.json()

async def main(urls):
    sem = asyncio.Semaphore(CONCURRENCY)
    async with httpx.AsyncClient(
        headers={"x-scrapeunblocker-key": KEY},
        timeout=180,
    ) as client:
        return await asyncio.gather(*(scrape(client, sem, u) for u in urls))

results = asyncio.run(main([
    "https://www.amazon.com/dp/B08N5WRWNW",
    "https://www.amazon.com/dp/B07FZ8S74R",
]))
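gather returns the payloads in the same order as the input URLs, so each one can be read with the parsed_data shape shown earlier:
# Results arrive in input order; each payload matches the parsed_data shape above.
for payload in results:
    print(payload["data"]["page_type"], payload["data"]["data"].get("title"))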
Retries that handle every failure mode
A 403 rotates the proxy country once; 408, 503, and 504 retry with exponential backoff; 401, 422, and other terminal codes fail fast.
import os
import time
import requests
KEY = os.environ["SCRAPEUNBLOCKER_KEY"]
RETRYABLE = {408, 503, 504}
def fetch(url, **params):
    rotated = False
    for attempt in range(4):
        r = requests.post(
            "https://api.scrapeunblocker.com/getPageSource",
            params={"url": url, **params},
            headers={"x-scrapeunblocker-key": KEY},
        )
        if r.status_code == 200:
            return r
        if r.status_code == 403 and not rotated:
            # Blocked: rotate the proxy country once, then retry immediately.
            params = {**params, "proxy_country": "us"}
            rotated = True
            continue
        if r.status_code in RETRYABLE:
            # Transient errors: back off exponentially before retrying.
            time.sleep(2 ** attempt)
            continue
        # Terminal codes (401, 422, ...) fail fast.
        r.raise_for_status()
    r.raise_for_status()
    return r
response = fetch("https://example.com", parsed_data=True)
See handling failures for what each status code means.