Production-Ready Web Scraping at Scale
Web scraping has evolved from simple data extraction scripts to sophisticated systems capable of processing millions of pages. Building production-ready scrapers requires more than just parsing HTML—it demands robust architecture, anti-detection strategies, and efficient data processing pipelines.
1. Advanced Scrapy Architecture
Here’s how to build a scalable scraping system with proper error handling:
# scrapers/distributed_spider.py
import redis
import scrapy
from scrapy import Spider, signals
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError, TimeoutError

class DistributedSpider(Spider):
    name = 'distributed'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.redis_client = redis.Redis(host='localhost', port=6379, db=0)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super().from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def spider_closed(self, spider):
        # Clean up the Redis connection when the crawl finishes
        self.redis_client.close()

    def start_requests(self):
        # Get URLs from the shared Redis queue until it runs dry
        while True:
            url = self.redis_client.lpop('scraper_queue')
            if not url:
                break
            yield scrapy.Request(url=url.decode('utf-8'), errback=self.handle_error)

    def handle_error(self, failure):
        # Robust error handling with retry logic
        request = failure.request
        if failure.check(HttpError):
            self.logger.error(f"HTTP Error on {request.url}: {failure.value}")
        elif failure.check(DNSLookupError):
            self.logger.error(f"DNS Error on {request.url}")
        elif failure.check(TimeoutError):
            self.logger.error(f"Timeout on {request.url}")

        # Re-queue failed URLs, carrying the retry count in request.meta;
        # for a true exponential backoff delay, pair this with Scrapy's
        # built-in RetryMiddleware or a delayed re-queue into Redis
        retry_count = request.meta.get('retry_count', 0)
        if retry_count < 3:
            yield request.replace(
                meta={**request.meta, 'retry_count': retry_count + 1},
                dont_filter=True,
            )
# Task queue integration with Celery
from celery import Celery
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Broker URL is an assumption; point it at your own Redis or RabbitMQ instance
app = Celery('scraper_tasks', broker='redis://localhost:6379/1')

@app.task(bind=True, max_retries=3)
def scrape_website(self, spider_name, url):
    try:
        # CrawlerProcess starts the Twisted reactor, which cannot be restarted;
        # run each crawl in a fresh worker process (e.g. --max-tasks-per-child=1)
        process = CrawlerProcess(get_project_settings())
        process.crawl(spider_name, start_urls=[url])
        process.start()
    except Exception as exc:
        raise self.retry(exc=exc, countdown=60)
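A minimal producer sketch, assuming the same Redis instance and the scrape_website task above (the enqueue_urls helper and the spider name are illustrative):

import redis

def enqueue_urls(urls, spider_name='distributed'):
    r = redis.Redis(host='localhost', port=6379, db=0)
    for url in urls:
        # Seed the shared queue that DistributedSpider.start_requests reads from
        r.rpush('scraper_queue', url)
        # Hand the URL to a Celery worker as an independent crawl job
        scrape_website.delay(spider_name, url)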
2. Anti-Detection Strategies
The key to avoiding detection is to behave like a human:
import random
import time

class StealthMiddleware:
    def __init__(self):
        self.user_agents = self.load_user_agents()
        self.proxy_pool = self.initialize_proxy_pool()

    def process_request(self, request, spider):
        # Rotate user agents
        request.headers['User-Agent'] = random.choice(self.user_agents)

        # Use residential proxies for sensitive sites
        if self.is_sensitive_domain(request.url):
            request.meta['proxy'] = self.get_residential_proxy()
        else:
            request.meta['proxy'] = self.get_datacenter_proxy()

        # Add random delays (a blocking sleep keeps the example short; in
        # production prefer DOWNLOAD_DELAY or AutoThrottle)
        delay = random.uniform(1, 3)
        time.sleep(delay)

        # Mimic browser headers
        request.headers.update({
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
        })
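To activate the middleware, register it in the project settings; the module path below is an assumption about where StealthMiddleware lives in your project:

# settings.py
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.StealthMiddleware': 543,
    # Disable Scrapy's default User-Agent middleware so the rotated one wins
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}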
3. Handling JavaScript-Heavy Sites
Modern websites rely heavily on JavaScript. Here’s how to handle them efficiently:
from playwright.async_api import async_playwright

class PlaywrightSpider:
    async def parse_with_js(self, url):
        async with async_playwright() as p:
            browser = await p.chromium.launch(
                proxy={"server": self.get_proxy()},
                args=['--disable-blink-features=AutomationControlled']
            )
            context = await browser.new_context(
                viewport={'width': 1920, 'height': 1080},
                user_agent=self.get_user_agent()
            )

            # Inject stealth scripts
            await context.add_init_script("""
                Object.defineProperty(navigator, 'webdriver', {
                    get: () => undefined
                });
            """)

            page = await context.new_page()
            await page.goto(url, wait_until='networkidle')

            # Wait for dynamic content
            await page.wait_for_selector('.product-grid', timeout=30000)

            content = await page.content()
            await browser.close()
            return content
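A simple way to drive the parser outside Scrapy is a plain asyncio entry point; get_proxy and get_user_agent are assumed to be helper methods on the class that return a proxy URL and a User-Agent string, and the target URL is illustrative:

import asyncio

async def main():
    spider = PlaywrightSpider()
    html = await spider.parse_with_js('https://example.com/products')
    print(f"Fetched {len(html)} bytes of rendered HTML")

asyncio.run(main())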
Data Quality Assurance
1. Schema Validation
from pydantic import BaseModel, validator
from typing import Optional

class ProductSchema(BaseModel):
    name: str
    price: float
    availability: bool
    description: Optional[str]

    @validator('price')
    def validate_price(cls, v):
        if v <= 0:
            raise ValueError('Price must be positive')
        return round(v, 2)

    @validator('name')
    def validate_name(cls, v):
        if len(v.strip()) < 3:
            raise ValueError('Name too short')
        return v.strip()
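One way to wire the schema into the item flow is a small pipeline that drops anything failing validation; ValidationPipeline is a hypothetical name, not part of Scrapy or Pydantic:

from pydantic import ValidationError
from scrapy.exceptions import DropItem

class ValidationPipeline:
    def process_item(self, item, spider):
        try:
            validated = ProductSchema(**dict(item))
        except ValidationError as exc:
            raise DropItem(f"Invalid item: {exc}")
        # Return a plain dict with normalized fields (price rounded, name stripped)
        return validated.dict()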
2. Deduplication Pipeline
import hashlib

from pybloom_live import BloomFilter  # pip install pybloom-live
from scrapy.exceptions import DropItem

class DeduplicationPipeline:
    def __init__(self):
        self.seen_items = BloomFilter(
            capacity=10000000,
            error_rate=0.001
        )

    def process_item(self, item, spider):
        # Create unique fingerprint
        fingerprint = self.create_fingerprint(item)
        if fingerprint in self.seen_items:
            raise DropItem(f"Duplicate item: {item['name']}")
        self.seen_items.add(fingerprint)
        return item

    def create_fingerprint(self, item):
        # Hash the fields that identify a record; adjust to your item schema
        key = f"{item.get('name', '')}|{item.get('price', '')}"
        return hashlib.sha1(key.encode('utf-8')).hexdigest()
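Both pipelines then need to be enabled in the project settings; the module paths are assumptions, and lower numbers run earlier in the chain:

# settings.py
ITEM_PIPELINES = {
    'myproject.pipelines.ValidationPipeline': 100,
    'myproject.pipelines.DeduplicationPipeline': 200,
}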
Monitoring and Alerting
Real-time Monitoring Dashboard
from prometheus_client import Counter, Histogram, Gauge

# Metrics
pages_scraped = Counter('scraper_pages_total', 'Total pages scraped')
scrape_duration = Histogram('scraper_duration_seconds', 'Scrape duration')
active_spiders = Gauge('scraper_active_spiders', 'Number of active spiders')
error_rate = Counter('scraper_errors_total', 'Total scraping errors')

class MonitoringMiddleware:
    def process_response(self, request, response, spider):
        pages_scraped.inc()
        if response.status >= 400:
            error_rate.inc()
        return response
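For Prometheus to scrape these counters, each worker also needs to expose an HTTP metrics endpoint; a minimal sketch, with the port chosen arbitrarily:

from prometheus_client import start_http_server

# Call once at worker startup (e.g. from the spider's from_crawler hook);
# Prometheus then scrapes http://<worker>:9100/metrics
start_http_server(9100)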
Scaling Strategies
1. Horizontal Scaling with Kubernetes
apiVersion: apps/v1
kind: Deployment
metadata:
  name: scraper-workers
spec:
  replicas: 10
  selector:
    matchLabels:
      app: scraper
  template:
    metadata:
      labels:
        app: scraper
    spec:
      containers:
        - name: scraper
          image: scraper:latest
          resources:
            requests:
              memory: "512Mi"
              cpu: "500m"
            limits:
              memory: "1Gi"
              cpu: "1000m"
          env:
            - name: CONCURRENT_REQUESTS
              value: "16"
            - name: DOWNLOAD_DELAY
              value: "2"
2. Auto-scaling Based on Queue Size
def auto_scale_workers():
    queue_size = redis_client.llen('scraper_queue')
    current_workers = get_current_worker_count()

    if queue_size > 10000 and current_workers < MAX_WORKERS:
        scale_up_workers(min(queue_size // 1000, MAX_WORKERS))
    elif queue_size < 1000 and current_workers > MIN_WORKERS:
        scale_down_workers(max(MIN_WORKERS, current_workers // 2))
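The scale_up_workers and scale_down_workers helpers are left undefined above. One possible implementation, assuming the Kubernetes Python client and the scraper-workers Deployment from the manifest, patches the Deployment's replica count (set_worker_count is a hypothetical name):

from kubernetes import client, config

def set_worker_count(replicas, name='scraper-workers', namespace='default'):
    # Assumes the autoscaler runs inside the cluster;
    # use config.load_kube_config() when running locally
    config.load_incluster_config()
    apps = client.AppsV1Api()
    apps.patch_namespaced_deployment_scale(
        name=name,
        namespace=namespace,
        body={'spec': {'replicas': replicas}},
    )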
Best Practices
- Respect robots.txt: Always check and follow website crawl policies (see the sketch after this list)
- Implement exponential backoff: Retry failed requests with increasing delays
- Use connection pooling: Reuse connections for better performance
- Cache DNS lookups: Reduce DNS resolution overhead
- Compress data: Store and transfer compressed data
- Monitor everything: You can’t improve what you don’t measure
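As referenced above, the first two practices take only a few lines with the standard library and requests; the function names are illustrative, and inside Scrapy the ROBOTSTXT_OBEY setting and RetryMiddleware cover the same ground:

import random
import time
from urllib import robotparser
from urllib.parse import urlsplit

import requests

def allowed_by_robots(url, user_agent='my-scraper'):
    parts = urlsplit(url)
    rp = robotparser.RobotFileParser(f"{parts.scheme}://{parts.netloc}/robots.txt")
    rp.read()
    return rp.can_fetch(user_agent, url)

def fetch_with_backoff(url, max_retries=3):
    for attempt in range(max_retries + 1):
        try:
            return requests.get(url, timeout=10)
        except requests.RequestException:
            if attempt == max_retries:
                raise
            # Exponential backoff with jitter: 1s, 2s, 4s (plus up to 1s of noise)
            time.sleep(2 ** attempt + random.uniform(0, 1))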
Performance Optimization Tips
1. Async Processing
import asyncio
import aiohttp

async def fetch_many(urls):
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_one(session, url) for url in urls]
        return await asyncio.gather(*tasks)

async def fetch_one(session, url):
    async with session.get(url) as response:
        return await response.text()
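When the URL list is large, it is worth capping concurrency so thousands of sockets are not opened at once; a variation on the snippet above, reusing its imports and fetch_one, with an arbitrary limit of 50:

async def fetch_many_bounded(urls, limit=50):
    semaphore = asyncio.Semaphore(limit)
    async with aiohttp.ClientSession() as session:
        async def bounded(url):
            async with semaphore:
                return await fetch_one(session, url)
        # gather still returns results in input order
        return await asyncio.gather(*(bounded(u) for u in urls))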
2. Memory-Efficient Parsing
from lxml import etree

def parse_large_xml(file_path):
    # Use iterparse for memory efficiency
    for event, elem in etree.iterparse(file_path, events=('start', 'end')):
        if event == 'end' and elem.tag == 'product':
            yield extract_product_data(elem)
            elem.clear()  # Free memory
            while elem.getprevious() is not None:
                del elem.getparent()[0]
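extract_product_data is left undefined above; a minimal placeholder might just read child text nodes, where the tag names are assumptions about the feed layout:

def extract_product_data(elem):
    return {
        'name': elem.findtext('name'),
        'price': elem.findtext('price'),
        'availability': elem.findtext('availability'),
    }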
Conclusion
Building production-ready web scrapers requires thinking beyond just extracting data. It’s about creating resilient, scalable systems that can adapt to changing websites while maintaining high performance and data quality.
The key is to start with a solid architecture, implement robust error handling, and continuously monitor and optimize your scrapers. With the right approach, you can build systems that reliably process millions of pages while staying under the radar.
Remember: with great scraping power comes great responsibility. Always respect website terms of service and implement rate limiting to be a good citizen of the web.
Next Steps
- Implement distributed tracing for better debugging
- Explore machine learning for adaptive scraping strategies
- Build a custom proxy rotation service
- Create a visual scraping rule builder
Happy scraping! 🕷️