#!/usr/bin/env python3
"""
Fast parallel image processing script to crop whitespace and resize images.

Removes white space from the left and right of each image, then upscales it
(preserving aspect ratio) so that it meets a minimum height and/or width.
Uses multiprocessing for high-speed batch processing of large image collections.
"""

import argparse
import os
import sys
from functools import partial
from multiprocessing import Pool, cpu_count
from pathlib import Path

from PIL import Image, ImageChops
from tqdm import tqdm

# Allow processing of very large images.
# NOTE(security): setting this to None turns off Pillow's decompression-bomb
# size check, so this script should only be pointed at trusted images.
Image.MAX_IMAGE_PIXELS = None

# Colour that counts as "whitespace" and is used to flatten transparency.
_WHITE = (255, 255, 255)

# File suffixes (lower-case) that process_directory will pick up.
_IMAGE_EXTENSIONS = frozenset(
    {'.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tif', '.tiff', '.webp'}
)


def _flatten_to_rgb(img):
    """Return *img* in RGB mode, compositing any transparency onto white."""
    if img.mode == 'RGB':
        return img

    has_alpha = img.mode in ('RGBA', 'LA') or (
        img.mode == 'P' and 'transparency' in img.info
    )
    if not has_alpha:
        return img.convert('RGB')

    # A plain convert('RGB') would just drop the alpha channel and expose the
    # colour stored under transparent pixels (often black), so transparent
    # margins would never be detected as whitespace.
    rgba = img.convert('RGBA')
    flattened = Image.new('RGB', rgba.size, _WHITE)
    flattened.paste(rgba, mask=rgba.split()[-1])
    return flattened


def trim_whitespace(img, tolerance=10):
    """
    Remove whitespace from left and right sides of image.

    The top and bottom edges are never cropped. The returned image is always
    in RGB mode; transparent areas are treated as white.

    Args:
        img: PIL Image object
        tolerance: How much variation from white to accept (0-255)

    Returns:
        Cropped PIL Image object (the RGB image itself, uncropped, if it is
        entirely white within the tolerance)
    """
    img = _flatten_to_rgb(img)

    # Difference against a solid white canvas: 0 where the pixel is white.
    white_bg = Image.new('RGB', img.size, _WHITE)
    diff = ImageChops.difference(img, white_bg).convert('L')

    # Apply tolerance - values below tolerance become 0 (considered white)
    bbox = diff.point(lambda x: 0 if x < tolerance else 255).getbbox()
    if bbox is None:
        return img

    # Keep original top and bottom, only crop left and right
    left, _top, right, _bottom = bbox
    return img.crop((left, 0, right, img.height))


def resize_to_min_dimensions(img, min_height=1400, min_width=None):
    """
    Resize image to ensure minimum dimensions while maintaining aspect ratio.

    Images that already satisfy the minimums are returned unchanged; images
    are only ever scaled up, never down.

    Args:
        img: PIL Image object
        min_height: Minimum height in pixels (None or 0 disables the check)
        min_width: Minimum width in pixels (optional)

    Returns:
        Resized PIL Image object
    """
    width, height = img.size

    # A zero-sized image cannot be scaled (and would divide by zero below).
    if width == 0 or height == 0:
        return img

    # Calculate scale factors needed for each dimension
    scale_height = min_height / height if min_height and height < min_height else 1.0
    scale_width = min_width / width if min_width and width < min_width else 1.0

    # Use the larger scale to ensure both minimums are met
    scale = max(scale_height, scale_width)
    if scale <= 1.0:
        return img

    new_width = round(width * scale)
    new_height = round(height * scale)

    # Floating-point error in dim * (minimum / dim) can land just under the
    # requested minimum; clamp so the guarantee actually holds.
    if min_width:
        new_width = max(new_width, min_width)
    if min_height:
        new_height = max(new_height, min_height)

    return img.resize((new_width, new_height), Image.LANCZOS)


def process_image(input_path, output_dir, min_height=1400, min_width=None, tolerance=10):
    """
    Process a single image: crop whitespace and resize.

    The result is written to ``output_dir`` under the same file name as the
    input. Any failure is captured and reported in the return value rather
    than raised, so one bad file does not abort a whole batch.

    Args:
        input_path: Path to input image
        output_dir: Directory to save processed image
        min_height: Minimum height in pixels
        min_width: Minimum width in pixels (optional)
        tolerance: Whitespace detection tolerance

    Returns:
        Tuple of (success: bool, filename: str, dimensions: tuple, error: str)
    """
    filename = Path(input_path).name
    try:
        # The context manager closes the source file handle even on failure,
        # so long-lived pool workers do not leak descriptors.
        with Image.open(input_path) as source:
            # Trim whitespace from left and right
            img = trim_whitespace(source, tolerance)

            # Resize to ensure minimum dimensions
            img = resize_to_min_dimensions(img, min_height, min_width)

            # Save processed image under the original file name
            output_path = Path(output_dir) / filename
            img.save(output_path, quality=95, optimize=True)
            size = img.size

        return (True, filename, size, None)

    except Exception as e:
        # Worker boundary: report the failure to the parent instead of raising.
        return (False, filename, None, str(e))


def process_directory(input_dir, output_dir, min_height=1400, min_width=None,
                      tolerance=10, workers=None):
    """
    Process all images in a directory using parallel processing.

    Only files directly inside ``input_dir`` are considered (no recursion).
    Progress and a per-file summary are printed to stdout.

    Args:
        input_dir: Input directory path
        output_dir: Output directory path
        min_height: Minimum height in pixels
        min_width: Minimum width in pixels (optional)
        tolerance: Whitespace detection tolerance
        workers: Number of parallel workers (None = auto-detect). Values are
            clamped to at least 1 and at most the number of images.
    """
    # Create output directory if it doesn't exist
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Collect image files; sorted so runs are reproducible (iterdir order is
    # arbitrary).
    image_files = sorted(
        f for f in Path(input_dir).iterdir()
        if f.is_file() and f.suffix.lower() in _IMAGE_EXTENSIONS
    )

    if not image_files:
        print(f"No image files found in {input_dir}")
        return

    # Determine number of workers
    if workers is None:
        workers = cpu_count() - 1  # Leave one core free
    # Pool rejects fewer than 1 process, and extra workers beyond the number
    # of images would only sit idle.
    workers = max(1, min(workers, len(image_files)))

    print(f"Found {len(image_files)} images to process")
    print(f"Using {workers} parallel workers\n")

    # Create partial function with fixed parameters
    process_func = partial(
        process_image,
        output_dir=output_dir,
        min_height=min_height,
        min_width=min_width,
        tolerance=tolerance,
    )

    # Process images in parallel with progress bar
    with Pool(processes=workers) as pool:
        results = list(tqdm(
            pool.imap(process_func, image_files),
            total=len(image_files),
            desc="Processing images",
            unit="img",
        ))

    # Print results
    success_count = 0
    error_count = 0
    print("\n" + "=" * 60)
    for success, filename, dimensions, error in results:
        if success:
            success_count += 1
            print(f"✓ {filename} -> {dimensions[0]}x{dimensions[1]}")
        else:
            error_count += 1
            print(f"✗ {filename}: {error}")

    print("=" * 60)
    print("\nProcessing complete!")
    print(f"  Success: {success_count}/{len(image_files)}")
    if error_count > 0:
        print(f"  Errors:  {error_count}/{len(image_files)}")
    print(f"  Output:  {output_dir}")


def main():
    """Parse command-line arguments and run the batch processing."""
    parser = argparse.ArgumentParser(
        description='Crop whitespace and resize images to minimum dimensions (parallel processing)'
    )
    parser.add_argument(
        'input_dir',
        help='Input directory containing images'
    )
    parser.add_argument(
        'output_dir',
        help='Output directory for processed images'
    )
    parser.add_argument(
        '--min-height',
        type=int,
        default=1400,
        help='Minimum height in pixels (default: 1400)'
    )
    parser.add_argument(
        '--min-width',
        type=int,
        default=None,
        help='Minimum width in pixels (optional)'
    )
    parser.add_argument(
        '--tolerance',
        type=int,
        default=10,
        help='Whitespace detection tolerance 0-255 (default: 10)'
    )
    parser.add_argument(
        '--workers',
        type=int,
        default=None,
        help=f'Number of parallel workers (default: auto = {max(1, cpu_count() - 1)})'
    )

    args = parser.parse_args()

    # Validate input directory. is_dir() (not exists()) so that a path to a
    # regular file is rejected here instead of crashing later in iterdir().
    if not Path(args.input_dir).is_dir():
        print(f"Error: Input directory '{args.input_dir}' does not exist or is not a directory")
        sys.exit(1)

    # Process images
    process_directory(
        args.input_dir,
        args.output_dir,
        args.min_height,
        args.min_width,
        args.tolerance,
        args.workers
    )


if __name__ == '__main__':
    main()