⏱️6 min read · 1,135 words
Web scraping em 2026 requer a ferramenta certa para o trabalho: requests+BeautifulSoup para páginas estáticas, Playwright para sites com muito JavaScript, Scrapy para rastreamento em grande escala e técnicas de bypass anti-bot para sites protegidos. Este guia cobre todos eles.
📋 Table of Contents
A pilha de raspagem da Web em 2026
- requests + BeautifulSoup — simples e rápido para páginas HTML estáticas
- httpx — solicitações assíncronas, suporte a HTTP/2
- Playwright (Python) — automação completa do navegador, renderização de JS
- Scrapy — framework de spiders de nível de produção, pipelines integrados
- Selenium — legado; prefira o Playwright
- Crawlee — framework moderno baseado em Playwright com escalonamento automático
requests + BeautifulSoup: páginas estáticas
pip install requests beautifulsoup4 lxml
import requests
from bs4 import BeautifulSoup
# Basic scrape with browser-like headers (avoid 403 errors).
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
}
# timeout=10 keeps the script from hanging on an unresponsive server;
# raise_for_status() turns HTTP error responses into exceptions.
response = requests.get("https://news.ycombinator.com", headers=headers, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, "lxml")

# Extract Hacker News titles: each ".athing" row holds the title link, and the
# score is looked up in the row that immediately follows it.
stories = []
for item in soup.select(".athing"):
    title_tag = item.select_one(".titleline a")
    if title_tag is None:
        # Rows without a title link carry nothing to report.
        continue
    # find_next_sibling() returns None when the row has no following sibling;
    # guard it so a layout change cannot raise AttributeError here.
    subtext_row = item.find_next_sibling()
    score_tag = subtext_row.select_one(".score") if subtext_row is not None else None
    stories.append({
        "title": title_tag.get_text(strip=True),
        "url": title_tag.get("href"),
        # The score text's first whitespace-separated token is parsed as the
        # integer score; rows without a score element count as 0.
        "score": int(score_tag.get_text().split()[0]) if score_tag else 0,
    })

# Sort by score, highest first, and print the top ten.
stories.sort(key=lambda x: x["score"], reverse=True)
for story in stories[:10]:
    print(f"{story['score']:4d} | {story['title'][:70]}")
Raspagem assíncrona com httpx
import asyncio
import httpx
from bs4 import BeautifulSoup
async def scrape_page(client: httpx.AsyncClient, url: str) -> dict:
    """Fetch *url* with *client* and summarise the page.

    Returns a dict with the requested ``url``, the stripped text of the first
    ``<h1>`` (empty string when there is none) under ``title``, and the first
    500 characters of the page's whitespace-joined text under ``text``.

    HTTP error statuses propagate as exceptions via ``raise_for_status()``.
    """
    resp = await client.get(url)
    resp.raise_for_status()
    document = BeautifulSoup(resp.text, "lxml")

    heading = document.find("h1")
    if heading:
        title = heading.get_text(strip=True)
    else:
        title = ""

    preview = document.get_text(separator=" ", strip=True)[:500]
    return {"url": url, "title": title, "text": preview}
async def scrape_many(urls: list[str]) -> list[dict]:
    """Scrape every URL in *urls* concurrently through one shared client.

    The client is limited to 10 connections (5 kept alive) with a timeout of
    15. Results that are ``Exception`` instances are dropped without being
    reported, so the returned list may be shorter than *urls*.
    """
    client_headers = {"User-Agent": "Mozilla/5.0 compatible"}
    pool_limits = httpx.Limits(max_connections=10, max_keepalive_connections=5)
    async with httpx.AsyncClient(headers=client_headers, limits=pool_limits, timeout=15) as client:
        # return_exceptions=True makes gather hand back failures as values
        # instead of raising on the first one.
        outcomes = await asyncio.gather(
            *[scrape_page(client, url) for url in urls],
            return_exceptions=True,
        )

    successes = []
    for outcome in outcomes:
        if isinstance(outcome, Exception):
            continue
        successes.append(outcome)
    return successes
# Example run: scrape two article URLs concurrently and report how many pages
# came back.
urls = [
"https://techpulsesite.com/python-asyncio-guide-2026/",
"https://techpulsesite.com/docker-complete-guide-2026/",
]
# asyncio.run drives the scrape_many coroutine to completion and returns its result.
data = asyncio.run(scrape_many(urls))
# scrape_many filters out failed pages, so this count can be lower than len(urls).
print(f"Scraped {len(data)} pages")
Playwright: sites renderizados em JavaScript
pip install playwright
playwright install chromium
from playwright.async_api import async_playwright
import asyncio
async def scrape_spa(url: str) -> dict:
    """Render *url* in headless Chromium and collect its product cards.

    Returns ``{"url": url, "products": [...]}`` where each product is a dict
    with ``name`` and ``price`` taken from the ``.product-title`` and
    ``.product-price`` elements of every ``.product-card`` (empty string when
    an element is missing).

    Exceptions raised while navigating or waiting for ``.product-list``
    propagate to the caller; the browser is closed first in either case.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context(
                user_agent="Mozilla/5.0 compatible",
                viewport={"width": 1280, "height": 720},
            )
            page = await context.new_page()
            # Navigate and wait for content to load
            await page.goto(url, wait_until="networkidle")
            # Wait for the product container (Playwright timeouts are in milliseconds)
            await page.wait_for_selector(".product-list", timeout=10000)
            # Extract data
            products = []
            for item in await page.query_selector_all(".product-card"):
                title = await item.query_selector(".product-title")
                price = await item.query_selector(".product-price")
                products.append({
                    "name": await title.inner_text() if title else "",
                    "price": await price.inner_text() if price else "",
                })
        finally:
            # Close the browser even when navigation or the selector wait
            # raises, instead of only on the success path.
            await browser.close()
        return {"url": url, "products": products}
# Screenshot for debugging
async def screenshot_page(url: str):
    """Save a full-page screenshot of *url* to ``page.png`` for debugging.

    Runs headless Chromium, waits for the ``networkidle`` load state and
    writes the image into the current working directory. Returns ``None``.
    Navigation or screenshot errors propagate after the browser is closed.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(url, wait_until="networkidle")
            await page.screenshot(path="page.png", full_page=True)
        finally:
            # Close the browser on the failure path too, not only on success.
            await browser.close()
# Example run: scrape one product page and report how many product cards
# scrape_spa returned under the "products" key.
result = asyncio.run(scrape_spa("https://example-spa.com/products"))
print(f"Found {len(result['products'])} products")
Scrapy: rastreamento em grande escala
pip install scrapy
scrapy startproject techpulse_crawler
cd techpulse_crawler
scrapy genspider tech_spider techpulsesite.com
# spiders/tech_spider.py
import scrapy
class TechSpider(scrapy.Spider):
    """Spider that walks techpulsesite.com listings and yields one item per article."""

    name = "tech_spider"
    allowed_domains = ["techpulsesite.com"]
    start_urls = ["https://techpulsesite.com/"]
    custom_settings = {
        "DOWNLOAD_DELAY": 1,  # polite: 1s between requests
        "CONCURRENT_REQUESTS": 8,
        "ROBOTSTXT_OBEY": True,
        "USER_AGENT": "TechBot/1.0 (+https://techpulsesite.com)",
    }

    def parse(self, response):
        """Queue every link found inside ``<article>`` elements, then the next listing page."""
        article_hrefs = response.css("article a::attr(href)").getall()
        for href in article_hrefs:
            yield response.follow(href, callback=self.parse_article)

        # Pagination: keep crawling while an "a.next-page" link is present.
        pagination_href = response.css("a.next-page::attr(href)").get()
        if pagination_href:
            yield response.follow(pagination_href, callback=self.parse)

    def parse_article(self, response):
        """Yield a dict with the article's url, title, author, date, tags and content."""
        heading = response.css("h1::text").get("")
        byline = response.css(".author-name::text").get("")
        published = response.css("time::attr(datetime)").get("")
        tag_names = response.css(".tag::text").getall()
        paragraphs = response.css("article p::text").getall()
        yield {
            "url": response.url,
            "title": heading.strip(),
            "author": byline,
            "date": published,
            "tags": tag_names,
            # Paragraph text nodes are joined with single spaces.
            "content": " ".join(paragraphs),
        }
# Run spider
# scrapy crawl tech_spider -o output.json
Lidando com proteção anti-bot
import time, random
# Random delay of 1 to 3 seconds. As written it runs once, at this point in the
# script; call it before each request to space requests out.
time.sleep(random.uniform(1, 3))
# Pool of User-Agent strings to choose from at random.
USER_AGENTS = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 14_1) AppleWebKit/605.1.15 Safari/605.1.15",
"Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0",
]
# Use sessions with cookies
session = requests.Session()
# One User-Agent is picked here and set as a session-wide header; it is not
# re-drawn per request unless this line is run again.
session.headers.update({"User-Agent": random.choice(USER_AGENTS)})
# For Cloudflare-protected sites: use Playwright (renders JS, handles challenges)
# Or commercial services: Bright Data, Oxylabs, ScraperAPI, ZenRows
# Playwright with a manual stealth tweak (no stealth plugin is used here)
from playwright.async_api import async_playwright
async def scrape_protected(url: str):
    """Load *url* in a visible Chromium window and return the page's HTML.

    The browser context uses a User-Agent picked at random from
    ``USER_AGENTS``, the ``en-US`` locale and the ``America/New_York``
    timezone, and an init script removes ``navigator.webdriver`` before the
    page's own scripts run. Navigation errors propagate after the browser is
    closed.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)  # non-headless for better scores
        try:
            context = await browser.new_context(
                user_agent=random.choice(USER_AGENTS),
                locale="en-US",
                timezone_id="America/New_York",
            )
            page = await context.new_page()
            # Mask automation signals
            await page.add_init_script("delete Object.getPrototypeOf(navigator).webdriver")
            await page.goto(url)
            content = await page.content()
        finally:
            # Close the browser window even when navigation fails.
            await browser.close()
        return content
Armazenamento de dados
import json, csv
from pathlib import Path
# JSON: dump the scraped rows as-is; ensure_ascii=False keeps non-ASCII text readable.
with open("data.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2, ensure_ascii=False)

# CSV: only the listed columns are written. extrasaction="ignore" stops
# DictWriter from raising ValueError on rows that carry other keys (the SQLite
# step below reads a "content" key, for example); keys missing from a row are
# written as empty cells.
with open("data.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(
        f,
        fieldnames=["title", "url", "price", "date"],
        extrasaction="ignore",
    )
    writer.writeheader()
    writer.writerows(results)

# SQLite via Python built-in
import sqlite3

conn = sqlite3.connect("scraped.db")
try:
    conn.execute("CREATE TABLE IF NOT EXISTS articles (id INTEGER PRIMARY KEY, title TEXT, url TEXT UNIQUE, content TEXT)")
    # INSERT OR IGNORE together with the UNIQUE url column skips rows whose URL
    # is already stored, so re-running the script does not duplicate articles.
    conn.executemany("INSERT OR IGNORE INTO articles (title, url, content) VALUES (?,?,?)",
                     [(r["title"], r["url"], r["content"]) for r in results])
    conn.commit()
finally:
    # Release the database file handle even if an insert fails.
    conn.close()
Diretrizes Legais e Éticas
- Verifique robots.txt— respeite as regras de rastreamento do site
- Limite a taxa das suas solicitações — no mínimo 1 a 3 segundos entre solicitações
- Não sobrecarregue os servidores— poucas conexões simultâneas
- Verifique os Termos de Serviço– alguns sites proíbem explicitamente a raspagem
- Não use dados raspados comercialmente — sem licenciamento adequado
- Use APIs oficiais quando disponíveis– primeira escolha, sempre
O web scraping em Python em 2026 cobre um amplo espectro. Use requests+BeautifulSoup para HTML simples, Playwright para páginas com muito JavaScript e Scrapy para rastreamento em escala de produção. Sempre respeite os sites que você acessa e prefira APIs oficiais, quando disponíveis.
🔗 Share this article
✍️ Leave a Comment