Python Web Scraping 2026: Guia Playwright, Scrapy, BeautifulSoup

⏱️6 min read · 1,135 words

Web scraping em 2026 requer a ferramenta certa para o trabalho: requests+BeautifulSoup para páginas estáticas, Playwright para sites com muito JavaScript, Scrapy para rastreamento em grande escala e técnicas de bypass anti-bot para sites protegidos. Este guia cobre todos eles.

📋 Table of Contents

A pilha de raspagem da Web em 2026
requests + BeautifulSoup: páginas estáticas
Raspagem assíncrona com httpx
Playwright: sites renderizados em JavaScript
Scrapy: rastreamento em grande escala
Lidando com proteção anti-bot
Armazenamento de dados
Diretrizes Legais e Éticas

A pilha de raspagem da Web em 2026

requests + BeautifulSoup — simples e rápido para páginas HTML estáticas
httpx— solicitações assíncronas, suporte HTTP/2
Playwright (Python) — automação completa do navegador, renderização de JS
Scrapy – framework de spiders de nível de produção, com pipelines integrados
Selenium – legado; prefira o Playwright
Crawlee — framework moderno baseado em Playwright, com escalonamento automático

requests + BeautifulSoup: páginas estáticas

pip install requests beautifulsoup4 lxml

import requests
from bs4 import BeautifulSoup

# Basic scrape with headers (avoid 403 errors)
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
}

# timeout=10 keeps the script from hanging forever on an unresponsive server;
# raise_for_status() turns 4xx/5xx responses into an exception instead of
# letting an error page be parsed as if it were content.
response = requests.get("https://news.ycombinator.com", headers=headers, timeout=10)
response.raise_for_status()

soup = BeautifulSoup(response.text, "lxml")

# Extract Hacker News titles
stories = []
for item in soup.select(".athing"):
    title_tag = item.select_one(".titleline a")
    if title_tag is None:
        # No title link in this row: nothing to record.
        continue

    # The score is looked up in the element that follows the title row.
    # find_next_sibling() returns None when there is no such element, so guard
    # it instead of calling .select_one on None (AttributeError).
    subtext_row = item.find_next_sibling()
    score_tag = subtext_row.select_one(".score") if subtext_row is not None else None

    stories.append({
        "title": title_tag.get_text(strip=True),
        "url": title_tag.get("href"),
        # Score text is parsed from its first whitespace-separated token;
        # entries without a score element default to 0.
        "score": int(score_tag.get_text().split()[0]) if score_tag else 0,
    })

# Sort by score
stories.sort(key=lambda x: x["score"], reverse=True)
for story in stories[:10]:
    print(f"{story['score']:4d} | {story['title'][:70]}")

Raspagem assíncrona com httpx

import asyncio
import httpx
from bs4 import BeautifulSoup

async def scrape_page(client: httpx.AsyncClient, url: str) -> dict:
    """Fetch *url* through the shared client and summarise the page.

    Args:
        client: An open ``httpx.AsyncClient`` used to issue the GET request.
        url: Address of the page to download.

    Returns:
        A dict with the keys ``"url"`` (the requested URL), ``"title"`` (text
        of the first ``<h1>``, or ``""`` when the page has none) and ``"text"``
        (the first 500 characters of the page's visible text).

    Raises:
        Whatever ``client.get`` or ``response.raise_for_status`` raise, e.g. on
        connection failures or 4xx/5xx responses.
    """
    response = await client.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    # Look the heading up once instead of searching the tree twice.
    heading = soup.find("h1")

    return {
        "url": url,
        "title": heading.get_text(strip=True) if heading is not None else "",
        "text": soup.get_text(separator=" ", strip=True)[:500],
    }

async def scrape_many(urls: list[str]) -> list[dict]:
    """Scrape every URL in *urls* concurrently and return the pages that succeeded.

    All requests share a single ``httpx.AsyncClient`` configured with at most
    10 connections (5 kept alive) and a 15-second timeout.

    Args:
        urls: Addresses of the pages to scrape.

    Returns:
        The dicts produced by ``scrape_page`` for the pages that were scraped
        successfully. Failed pages are dropped rather than raised, so the
        result may be shorter than *urls*.
    """
    headers = {"User-Agent": "Mozilla/5.0 compatible"}
    limits = httpx.Limits(max_connections=10, max_keepalive_connections=5)

    async with httpx.AsyncClient(headers=headers, limits=limits, timeout=15) as client:
        tasks = [scrape_page(client, url) for url in urls]
        # return_exceptions=True: one failing page must not abort the others;
        # failures come back as values in the result list.
        results = await asyncio.gather(*tasks, return_exceptions=True)

    # Filter on BaseException rather than Exception: asyncio.CancelledError
    # derives from BaseException and would otherwise slip through and be
    # returned to the caller as if it were a scraped page.
    return [r for r in results if not isinstance(r, BaseException)]

# Example run: scrape two article URLs concurrently.
urls = [
    "https://techpulsesite.com/python-asyncio-guide-2026/",
    "https://techpulsesite.com/docker-complete-guide-2026/",
]
# asyncio.run executes the scrape_many coroutine to completion.
data = asyncio.run(scrape_many(urls))
# scrape_many leaves failed pages out of its result, so this count can be
# lower than len(urls).
print(f"Scraped {len(data)} pages")

Playwright: sites renderizados em JavaScript

pip install playwright
playwright install chromium

from playwright.async_api import async_playwright
import asyncio

async def scrape_spa(url: str) -> dict:
    """Render *url* in headless Chromium and extract its product cards.

    Args:
        url: Address of the JavaScript-rendered page to scrape.

    Returns:
        ``{"url": url, "products": [...]}`` where each product is a dict with
        the keys ``"name"`` and ``"price"``; either value is ``""`` when the
        card lacks the corresponding element.

    Raises:
        Whatever Playwright raises when navigation fails or when
        ``.product-list`` does not appear within 10 seconds.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context(
                user_agent="Mozilla/5.0 compatible",
                viewport={"width": 1280, "height": 720},
            )
            page = await context.new_page()

            # Navigate and wait for content to load
            await page.goto(url, wait_until="networkidle")

            # Wait for specific element
            await page.wait_for_selector(".product-list", timeout=10000)

            # Extract data
            products = []
            for item in await page.query_selector_all(".product-card"):
                title = await item.query_selector(".product-title")
                price = await item.query_selector(".product-price")
                products.append({
                    "name": await title.inner_text() if title else "",
                    "price": await price.inner_text() if price else "",
                })

            return {"url": url, "products": products}
        finally:
            # Close the browser even when navigation, the selector wait or the
            # extraction raises, rather than only on the success path.
            await browser.close()

# Screenshot for debugging
async def screenshot_page(url: str):
    """Save a full-page screenshot of *url* to ``page.png``.

    The page is loaded in headless Chromium and captured once the network is
    idle. The output path is fixed, so each call overwrites the previous file.

    Args:
        url: Address of the page to capture.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(url, wait_until="networkidle")
            await page.screenshot(path="page.png", full_page=True)
        finally:
            # Close the browser even if navigation or the capture fails.
            await browser.close()

# Example run. The URL is presumably a placeholder; scrape_spa waits for a
# ".product-list" element and reads ".product-card" items from the page.
result = asyncio.run(scrape_spa("https://example-spa.com/products"))
print(f"Found {len(result['products'])} products")

Scrapy: rastreamento em grande escala

pip install scrapy
scrapy startproject techpulse_crawler
cd techpulse_crawler
scrapy genspider tech_spider techpulsesite.com

# spiders/tech_spider.py
import scrapy

class TechSpider(scrapy.Spider):
    """Crawl techpulsesite.com listing pages and emit one item per article."""

    name = "tech_spider"
    allowed_domains = ["techpulsesite.com"]
    start_urls = ["https://techpulsesite.com/"]
    # Settings applied to this spider only.
    custom_settings = {
        "DOWNLOAD_DELAY": 1,           # polite: 1s between requests
        "CONCURRENT_REQUESTS": 8,
        "ROBOTSTXT_OBEY": True,
        "USER_AGENT": "TechBot/1.0 (+https://techpulsesite.com)",
    }

    def parse(self, response):
        """Schedule every article linked from a listing page, then the next page.

        Yields one request per ``article a`` link (handled by
        ``parse_article``) and, when an ``a.next-page`` link exists, one more
        request that is handled by this method again.
        """
        article_hrefs = response.css("article a::attr(href)").getall()
        yield from (
            response.follow(href, callback=self.parse_article)
            for href in article_hrefs
        )

        # Pagination: stop here when the listing has no further page.
        following_page = response.css("a.next-page::attr(href)").get()
        if not following_page:
            return
        yield response.follow(following_page, callback=self.parse)

    def parse_article(self, response):
        """Yield a single item describing the article at ``response.url``.

        Missing title, author or date fall back to ``""``; tags are returned
        as a list and the paragraph texts are joined with single spaces.
        """

        def first_match(selector):
            # First match for the selector, or "" when nothing matches.
            return response.css(selector).get("")

        article = {
            "url": response.url,
            "title": first_match("h1::text").strip(),
            "author": first_match(".author-name::text"),
            "date": first_match("time::attr(datetime)"),
            "tags": response.css(".tag::text").getall(),
            "content": " ".join(response.css("article p::text").getall()),
        }
        yield article

# Run spider
# scrapy crawl tech_spider -o output.json

Lidando com proteção anti-bot

import time, random

# Random delays between requests
# As written this sleeps once, for a random 1-3 seconds; in a real crawl the
# call belongs between consecutive requests.
time.sleep(random.uniform(1, 3))

# Rotate user agents
# Pool of browser User-Agent strings to choose from.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_1) AppleWebKit/605.1.15 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0",
]

# Use sessions with cookies
# NOTE: `requests` is not imported in this snippet; it relies on the
# `import requests` from the earlier static-page example.
session = requests.Session()
# The User-Agent is picked once, when the session is configured, so every
# request sent through this session carries the same value.
session.headers.update({"User-Agent": random.choice(USER_AGENTS)})

# For Cloudflare-protected sites: use Playwright (renders JS, handles challenges)
# Or commercial services: Bright Data, Oxylabs, ScraperAPI, ZenRows

# Playwright with a manual stealth tweak (no plugin: an init script masks navigator.webdriver)
from playwright.async_api import async_playwright

async def scrape_protected(url: str):
    """Load *url* in a visible Chromium window and return the page's HTML.

    The browser context uses a User-Agent drawn at random from
    ``USER_AGENTS``, the ``en-US`` locale and the ``America/New_York``
    timezone, and an init script removes ``navigator.webdriver`` before any
    page script runs.

    Args:
        url: Address of the page to load.

    Returns:
        The page content as returned by ``page.content()``.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)  # non-headless for better scores
        try:
            context = await browser.new_context(
                user_agent=random.choice(USER_AGENTS),
                locale="en-US",
                timezone_id="America/New_York",
            )
            page = await context.new_page()
            # Mask automation signals
            await page.add_init_script("delete Object.getPrototypeOf(navigator).webdriver")
            await page.goto(url)
            return await page.content()
        finally:
            # Close the browser window even when navigation fails, rather than
            # only on the success path.
            await browser.close()

Armazenamento de dados

import json, csv
from pathlib import Path

# Persist the scraped records as JSON, CSV and SQLite.
# NOTE: `results` is not defined in this snippet; it is presumably the list of
# dicts produced by one of the scrapers — confirm its keys match the columns
# used below.

# JSON
with open("data.json", "w", encoding="utf-8") as f:
    # ensure_ascii=False keeps non-ASCII characters readable in the file.
    json.dump(results, f, indent=2, ensure_ascii=False)

# CSV
with open("data.csv", "w", newline="", encoding="utf-8") as f:
    # extrasaction="ignore": records may carry keys that are not CSV columns
    # (the SQLite step below reads "content" from the same records); the
    # default "raise" would abort with ValueError on the first such record.
    # Columns missing from a record are written as empty strings.
    writer = csv.DictWriter(
        f,
        fieldnames=["title", "url", "price", "date"],
        extrasaction="ignore",
    )
    writer.writeheader()
    writer.writerows(results)

# SQLite via Python built-in
import sqlite3
conn = sqlite3.connect("scraped.db")
try:
    # Using the connection as a context manager commits on success and rolls
    # back if a statement raises; it does not close the connection.
    with conn:
        conn.execute("CREATE TABLE IF NOT EXISTS articles (id INTEGER PRIMARY KEY, title TEXT, url TEXT UNIQUE, content TEXT)")
        # INSERT OR IGNORE skips rows whose url is already stored (url is UNIQUE).
        # A record without "content" is stored with NULL instead of raising KeyError.
        conn.executemany("INSERT OR IGNORE INTO articles (title, url, content) VALUES (?,?,?)",
                         [(r["title"], r["url"], r.get("content")) for r in results])
finally:
    # Release the database file handle whether or not the inserts succeeded.
    conn.close()

Diretrizes Legais e Éticas

Verifique robots.txt— respeite as regras de rastreamento do site
Limite a taxa das suas requisições — no mínimo 1 a 3 segundos entre requisições
Não sobrecarregue os servidores— poucas conexões simultâneas
Verifique os Termos de Serviço– alguns sites proíbem explicitamente a raspagem
Não use dados raspados comercialmente – não sem o licenciamento adequado
Use APIs oficiais quando disponíveis– primeira escolha, sempre

O web scraping em Python em 2026 cobre um amplo espectro. Use requests+BeautifulSoup para HTML simples, Playwright para páginas com muito JavaScript e Scrapy para rastreamento em escala de produção. Sempre respeite os sites que você acessa e prefira APIs oficiais, quando disponíveis.

🔗 Share this article

X / Twitter Facebook WhatsApp LinkedIn Telegram