Taking one screenshot is easy. Taking thousands while handling failures, managing memory, and maintaining speed is another challenge entirely. Let me show you how to build a robust bulk screenshot system.
Basic Batch Processing
Let’s start simple—processing a list of URLs:
```python
from playwright.sync_api import sync_playwright

urls = [
    'https://example.com',
    'https://github.com',
    'https://stackoverflow.com',
]

with sync_playwright() as p:
    browser = p.chromium.launch()

    for i, url in enumerate(urls):
        page = browser.new_page()
        try:
            page.goto(url, timeout=30000)
            page.screenshot(path=f'screenshot_{i}.png')
        except Exception as e:
            print(f'Failed: {url} - {e}')
        finally:
            page.close()

    browser.close()
```

This works, but it’s slow. Each screenshot waits for the previous one to complete.
Concurrent Processing with Async
The real power comes from async processing:
```python
import asyncio
from playwright.async_api import async_playwright

async def take_screenshot(browser, url, output_path):
    """Take a single screenshot."""
    page = await browser.new_page()
    try:
        await page.goto(url, timeout=30000)
        await page.wait_for_load_state('networkidle', timeout=10000)
        await page.screenshot(path=output_path, full_page=True)
        return {'url': url, 'status': 'success', 'path': output_path}
    except Exception as e:
        return {'url': url, 'status': 'failed', 'error': str(e)}
    finally:
        await page.close()

async def process_urls(urls):
    """Process multiple URLs concurrently."""
    async with async_playwright() as p:
        browser = await p.chromium.launch()

        tasks = [
            take_screenshot(browser, url, f'screenshots/{i}.png')
            for i, url in enumerate(urls)
        ]

        results = await asyncio.gather(*tasks)
        await browser.close()

        return results

# Run it
urls = ['https://example.com', 'https://github.com', 'https://stackoverflow.com']
results = asyncio.run(process_urls(urls))

for r in results:
    print(f"{r['url']}: {r['status']}")
```

This processes all URLs concurrently. On my machine, it’s about 5x faster than sequential processing.
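One caveat: asyncio.gather launches every URL at once, so a long list means an equally long list of open pages. If you want to keep the single-pass approach but cap how many pages are open at a time, one option is an asyncio.Semaphore. The helper below is a sketch, not part of the original example; the name process_urls_capped and the max_concurrency parameter are my own:

```python
async def process_urls_capped(urls, max_concurrency=10):
    """Like process_urls, but never opens more than max_concurrency pages."""
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        semaphore = asyncio.Semaphore(max_concurrency)

        async def capped(i, url):
            # The semaphore blocks here until a slot frees up.
            async with semaphore:
                return await take_screenshot(browser, url, f'screenshots/{i}.png')

        results = await asyncio.gather(*[capped(i, url) for i, url in enumerate(urls)])
        await browser.close()
        return results
```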
Reading URLs from CSV
For real-world use, you’ll often read URLs from a file.
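For example, here’s what a minimal urls.csv could look like. The header text itself is arbitrary, because the script below simply skips the first row and reads the first column of every remaining row:

```
url
https://example.com
https://github.com
https://stackoverflow.com
```

The script reads the file, screenshots every URL concurrently, and derives each filename from the page’s domain: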
```python
import asyncio
import csv
from playwright.async_api import async_playwright
from pathlib import Path
from urllib.parse import urlparse

def sanitize_filename(url):
    """Convert URL to safe filename."""
    parsed = urlparse(url)
    name = parsed.netloc.replace('.', '_')
    return f"{name}.png"

async def take_screenshot(browser, url, output_dir):
    """Take screenshot with safe filename."""
    filename = sanitize_filename(url)
    output_path = output_dir / filename

    page = await browser.new_page()
    try:
        await page.goto(url, timeout=30000)
        await page.wait_for_load_state('networkidle', timeout=10000)
        await page.screenshot(path=str(output_path), full_page=True)
        return {'url': url, 'status': 'success', 'path': str(output_path)}
    except Exception as e:
        return {'url': url, 'status': 'failed', 'error': str(e)}
    finally:
        await page.close()

async def process_csv(csv_path, output_dir):
    """Process URLs from CSV file."""
    output_dir = Path(output_dir)
    output_dir.mkdir(exist_ok=True)

    # Read URLs from CSV
    urls = []
    with open(csv_path, 'r') as f:
        reader = csv.reader(f)
        next(reader)  # Skip header
        urls = [row[0] for row in reader if row]

    print(f"Processing {len(urls)} URLs...")

    async with async_playwright() as p:
        browser = await p.chromium.launch()

        tasks = [take_screenshot(browser, url, output_dir) for url in urls]
        results = await asyncio.gather(*tasks)

        await browser.close()

    # Summary
    success = sum(1 for r in results if r['status'] == 'success')
    failed = len(results) - success
    print(f"Complete: {success} success, {failed} failed")

    return results

# Usage
asyncio.run(process_csv('urls.csv', 'screenshots'))
```

Batch Processing for Large Volumes
Processing thousands of URLs at once will exhaust memory. Use batching:
```python
import asyncio
from playwright.async_api import async_playwright

async def take_screenshot(browser, url, output_path):
    page = await browser.new_page()
    try:
        await page.goto(url, timeout=30000)
        await page.screenshot(path=output_path, full_page=True)
        return {'url': url, 'status': 'success'}
    except Exception as e:
        return {'url': url, 'status': 'failed', 'error': str(e)}
    finally:
        await page.close()

async def process_batch(browser, batch, start_index):
    """Process a batch of URLs."""
    tasks = [
        take_screenshot(browser, url, f'screenshots/{start_index + i}.png')
        for i, url in enumerate(batch)
    ]
    return await asyncio.gather(*tasks)

async def process_all_urls(urls, batch_size=10):
    """Process all URLs in batches."""
    all_results = []

    async with async_playwright() as p:
        browser = await p.chromium.launch()

        for i in range(0, len(urls), batch_size):
            batch = urls[i:i + batch_size]
            print(f"Processing batch {i//batch_size + 1} ({len(batch)} URLs)...")

            results = await process_batch(browser, batch, i)
            all_results.extend(results)

            # Optional: brief pause between batches
            await asyncio.sleep(0.5)

        await browser.close()

    return all_results

# Usage
urls = [f'https://example{i}.com' for i in range(100)]
results = asyncio.run(process_all_urls(urls, batch_size=10))
```

Rate Limiting
Some servers may rate-limit or block rapid requests. Add delays:
```python
import asyncio
import time
from playwright.async_api import async_playwright

class RateLimiter:
    def __init__(self, requests_per_second=2):
        self.delay = 1.0 / requests_per_second
        self.last_request = 0

    async def wait(self):
        now = time.time()
        wait_time = self.last_request + self.delay - now
        if wait_time > 0:
            await asyncio.sleep(wait_time)
        self.last_request = time.time()

async def take_screenshot_with_rate_limit(browser, url, output_path, limiter):
    await limiter.wait()
    page = await browser.new_page()
    try:
        await page.goto(url, timeout=30000)
        await page.screenshot(path=output_path)
        return {'url': url, 'status': 'success'}
    except Exception as e:
        return {'url': url, 'status': 'failed', 'error': str(e)}
    finally:
        await page.close()
```
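The class above isn’t wired into a runner yet, so here’s one way to use it. This is a sketch with a name of my own (run_with_limit); note that the limiter keeps simple shared state without a lock, so it’s called from a single sequential loop rather than from many concurrent tasks:

```python
async def run_with_limit(urls, requests_per_second=2):
    limiter = RateLimiter(requests_per_second=requests_per_second)
    results = []

    async with async_playwright() as p:
        browser = await p.chromium.launch()

        # Sequential loop: the limiter's state isn't lock-protected, so calling
        # it from many concurrent tasks would let requests slip through.
        for i, url in enumerate(urls):
            result = await take_screenshot_with_rate_limit(
                browser, url, f'screenshots/{i}.png', limiter
            )
            results.append(result)

        await browser.close()

    return results

# Usage
results = asyncio.run(run_with_limit(['https://example.com', 'https://github.com']))
```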
Progress Tracking
For long-running jobs, track progress:
```python
import asyncio
from playwright.async_api import async_playwright
from tqdm import tqdm

async def process_with_progress(urls):
    results = []

    async with async_playwright() as p:
        browser = await p.chromium.launch()

        with tqdm(total=len(urls), desc="Screenshots") as pbar:
            for i, url in enumerate(urls):
                result = await take_screenshot(browser, url, f'screenshots/{i}.png')
                results.append(result)
                pbar.update(1)
                pbar.set_postfix({
                    'success': sum(1 for r in results if r['status'] == 'success'),
                    'failed': sum(1 for r in results if r['status'] == 'failed')
                })

        await browser.close()

    return results
```
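This loop takes one screenshot at a time, which keeps the progress bar simple but gives up concurrency. If you want both, one option is to update the bar as tasks finish with asyncio.as_completed. The sketch below reuses the semaphore idea from earlier; the function name and max_concurrency parameter are my own, not part of the original example:

```python
async def process_with_progress_concurrent(urls, max_concurrency=10):
    results = []

    async with async_playwright() as p:
        browser = await p.chromium.launch()
        semaphore = asyncio.Semaphore(max_concurrency)

        async def capped(i, url):
            # At most max_concurrency pages are open at once.
            async with semaphore:
                return await take_screenshot(browser, url, f'screenshots/{i}.png')

        tasks = [capped(i, url) for i, url in enumerate(urls)]

        with tqdm(total=len(urls), desc="Screenshots") as pbar:
            # Results arrive in completion order, not submission order.
            for finished in asyncio.as_completed(tasks):
                results.append(await finished)
                pbar.update(1)

        await browser.close()

    return results
```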
Retry Failed URLs
Don’t lose failed screenshots—retry them:
```python
import asyncio
from playwright.async_api import async_playwright

async def take_screenshot_with_retry(browser, url, output_path, max_retries=3):
    """Take screenshot with automatic retry."""
    for attempt in range(max_retries):
        page = await browser.new_page()
        try:
            await page.goto(url, timeout=30000)
            await page.screenshot(path=output_path)
            await page.close()
            return {'url': url, 'status': 'success', 'attempts': attempt + 1}
        except Exception as e:
            await page.close()
            if attempt == max_retries - 1:
                return {'url': url, 'status': 'failed', 'error': str(e)}
            await asyncio.sleep(2 ** attempt)  # Exponential backoff

async def process_with_retry(urls, batch_size=10):
    results = []

    async with async_playwright() as p:
        browser = await p.chromium.launch()

        # First pass
        for i in range(0, len(urls), batch_size):
            batch = urls[i:i + batch_size]
            batch_results = await asyncio.gather(*[
                take_screenshot_with_retry(browser, url, f'screenshots/{i+j}.png')
                for j, url in enumerate(batch)
            ])
            results.extend(batch_results)

        await browser.close()

    # Log failures
    failed = [r for r in results if r['status'] == 'failed']
    if failed:
        with open('failed_urls.txt', 'w') as f:
            for r in failed:
                f.write(f"{r['url']}\t{r.get('error', 'Unknown')}\n")

    return results
```
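Since the failures end up in failed_urls.txt, you can feed that file back through the same retry helper later. Here’s a sketch; the rerun_failed name is mine, and it assumes take_screenshot_with_retry from above is in scope:

```python
async def rerun_failed(path='failed_urls.txt'):
    """Re-run URLs that were logged as failed in a previous pass."""
    # Each line is "<url>\t<error>"; only the URL column is needed.
    with open(path) as f:
        urls = [line.split('\t')[0].strip() for line in f if line.strip()]

    async with async_playwright() as p:
        browser = await p.chromium.launch()
        results = await asyncio.gather(*[
            take_screenshot_with_retry(browser, url, f'screenshots/retry_{i}.png')
            for i, url in enumerate(urls)
        ])
        await browser.close()

    return results
```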
Memory Management
Long-running screenshot jobs can leak memory. Tips:
- Close pages after each screenshot
- Restart browser periodically
- Process in batches
```python
async def process_with_browser_restart(urls, restart_every=200):
    """Restart browser periodically to prevent memory leaks."""
    results = []

    async with async_playwright() as p:
        browser = await p.chromium.launch()
        screenshots_since_restart = 0

        for i, url in enumerate(urls):
            # Restart browser if needed
            if screenshots_since_restart >= restart_every:
                await browser.close()
                browser = await p.chromium.launch()
                screenshots_since_restart = 0
                print(f"Browser restarted at screenshot {i}")

            result = await take_screenshot(browser, url, f'screenshots/{i}.png')
            results.append(result)
            screenshots_since_restart += 1

        await browser.close()

    return results
```

When to Use an API Instead
Playwright is great for moderate volumes, but at scale you’ll face:
- Server costs for compute resources
- Memory management complexity
- Browser crashes and recovery
- Rate limiting from target sites
For high-volume screenshot automation, a screenshot API like ScreenshotOne handles the infrastructure. See the comparison in our bulk screenshots API guide.
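For a rough sense of what that looks like in code, here’s a minimal sketch of offloading screenshots to an HTTP screenshot API. The endpoint, parameter names, and the httpx client are my assumptions for illustration; check your provider’s documentation for the real request format:

```python
import asyncio
import httpx  # assumed async HTTP client; any HTTP library works

API_URL = 'https://api.screenshotone.com/take'  # assumed endpoint, verify in the docs
ACCESS_KEY = 'your_access_key'

async def api_screenshot(client, url, output_path):
    # The API renders the page remotely, so no local browser is needed.
    params = {'access_key': ACCESS_KEY, 'url': url, 'full_page': 'true'}
    response = await client.get(API_URL, params=params, timeout=60)
    response.raise_for_status()
    with open(output_path, 'wb') as f:
        f.write(response.content)

async def main(urls):
    async with httpx.AsyncClient() as client:
        await asyncio.gather(*[
            api_screenshot(client, url, f'screenshots/api_{i}.png')
            for i, url in enumerate(urls)
        ])

asyncio.run(main(['https://example.com', 'https://github.com']))
```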
Complete Production Script
Here’s a production-ready script combining all techniques:
```python
import asyncio
import csv
import json
from pathlib import Path
from datetime import datetime
from playwright.async_api import async_playwright
from urllib.parse import urlparse

class BulkScreenshotter:
    def __init__(self, output_dir='screenshots', batch_size=10, max_retries=3):
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(exist_ok=True)
        self.batch_size = batch_size
        self.max_retries = max_retries
        self.results = []

    def _get_filename(self, url, index):
        parsed = urlparse(url)
        safe_name = parsed.netloc.replace('.', '_').replace(':', '_')
        return f"{index:05d}_{safe_name}.png"

    async def _screenshot(self, browser, url, index):
        filename = self._get_filename(url, index)
        output_path = self.output_dir / filename

        for attempt in range(self.max_retries):
            page = await browser.new_page()
            try:
                await page.goto(url, timeout=30000)
                await page.wait_for_load_state('networkidle', timeout=10000)
                await page.screenshot(path=str(output_path), full_page=True)
                await page.close()
                return {
                    'url': url,
                    'status': 'success',
                    'path': str(output_path),
                    'attempts': attempt + 1
                }
            except Exception as e:
                await page.close()
                if attempt == self.max_retries - 1:
                    return {
                        'url': url,
                        'status': 'failed',
                        'error': str(e),
                        'attempts': attempt + 1
                    }
                await asyncio.sleep(2 ** attempt)

    async def process(self, urls):
        async with async_playwright() as p:
            browser = await p.chromium.launch()

            for i in range(0, len(urls), self.batch_size):
                batch = urls[i:i + self.batch_size]
                tasks = [
                    self._screenshot(browser, url, i + j)
                    for j, url in enumerate(batch)
                ]
                batch_results = await asyncio.gather(*tasks)
                self.results.extend(batch_results)

                success = sum(1 for r in self.results if r['status'] == 'success')
                print(f"Progress: {len(self.results)}/{len(urls)} ({success} success)")

            await browser.close()

        self._save_report()
        return self.results

    def _save_report(self):
        report = {
            'timestamp': datetime.now().isoformat(),
            'total': len(self.results),
            'success': sum(1 for r in self.results if r['status'] == 'success'),
            'failed': sum(1 for r in self.results if r['status'] == 'failed'),
            'results': self.results
        }
        with open(self.output_dir / 'report.json', 'w') as f:
            json.dump(report, f, indent=2)

# Usage
urls = ['https://example.com', 'https://github.com']
screenshotter = BulkScreenshotter(batch_size=5)
results = asyncio.run(screenshotter.process(urls))
```

Summary
Building a bulk screenshot system:
- Use async Playwright for concurrent processing
- Process in batches to manage memory
- Implement retry logic for resilience
- Track progress and save reports
- Restart browser periodically for long jobs
Frequently Asked Questions
If you’ve read the article but still have questions, check the most frequently asked ones below. And if you still have questions, feel free to reach out at support@screenshotone.com.
How to automatically screenshot multiple websites in Python?
Use Playwright's async API with asyncio.gather() to process multiple URLs concurrently. Read URLs from a CSV or list, create async tasks for each, and process them in batches to manage memory.
How to handle failed screenshots in bulk processing?
Implement try-except blocks around each screenshot, log failures to a separate file, and optionally retry failed URLs. Keep track of success/failure counts for monitoring.
How to speed up bulk screenshot processing?
Use async Playwright with concurrent processing, reuse browser contexts, process in batches, and consider using a screenshot API for very large volumes. Concurrent processing can be 5-10x faster than sequential.