Last active 2 weeks ago

Fast parallel image processing script to crop whitespace and resize images. Removes white space from left and right, ensures minimum height and/or width. Uses multiprocessing for high-speed batch processing of large image collections.

insidiousfiddler's Avatar insidiousfiddler revised this gist 2 weeks ago. Go to revision

1 file changed, 253 insertions

batch_crop_resize.py(file created)

@@ -0,0 +1,253 @@
1 + #!/usr/bin/env python3
2 + """
3 + Fast parallel image processing script to crop whitespace and resize images.
4 + Removes white space from left and right, ensures minimum height and/or width.
5 + Uses multiprocessing for high-speed batch processing of large image collections.
6 + """
7 +
8 + import os
9 + import sys
10 + from pathlib import Path
11 + from PIL import Image, ImageChops
12 + import argparse
13 + from multiprocessing import Pool, cpu_count
14 + from functools import partial
15 + from tqdm import tqdm
16 +
17 + # Allow processing of very large images
18 + Image.MAX_IMAGE_PIXELS = None
19 +
20 +
def trim_whitespace(img, tolerance=10):
    """
    Remove whitespace from the left and right sides of an image.

    Transparent pixels are treated as white: images with an alpha channel
    are flattened onto a white background before the comparison, so
    transparent margins are trimmed just like white ones.

    Args:
        img: PIL Image object
        tolerance: Largest per-pixel difference from white (0-255) that is
            still considered whitespace. 0 trims pure white only.

    Returns:
        RGB PIL Image cropped horizontally to its non-white content. The top
        and bottom are never cropped. If the image contains no non-white
        pixels, the (RGB-converted) image is returned uncropped.
    """
    has_alpha = img.mode in ('RGBA', 'LA') or (
        img.mode == 'P' and 'transparency' in img.info
    )
    if has_alpha:
        # A plain convert('RGB') just drops the alpha channel, which usually
        # leaves transparent areas black; composite onto white instead.
        rgba = img.convert('RGBA')
        flattened = Image.new('RGB', rgba.size, (255, 255, 255))
        flattened.paste(rgba, mask=rgba.split()[-1])
        img = flattened
    elif img.mode != 'RGB':
        img = img.convert('RGB')

    # Per-pixel distance from solid white, reduced to a single channel
    white = Image.new('RGB', img.size, (255, 255, 255))
    diff = ImageChops.difference(img, white).convert('L')

    # Binarize: anything within the tolerance counts as white (0). The
    # comparison is inclusive so that tolerance=0 still trims pure white.
    content_mask = diff.point(lambda value: 255 if value > tolerance else 0)
    bbox = content_mask.getbbox()

    if bbox is None:
        # Entirely white image: nothing to anchor the crop on
        return img

    left, _, right, _ = bbox
    # Keep the full height; only the horizontal extent is trimmed
    return img.crop((left, 0, right, img.height))
55 +
56 +
def resize_to_min_dimensions(img, min_height=1400, min_width=None):
    """
    Upscale an image so it meets the minimum dimensions, keeping aspect ratio.

    The image is never downscaled. When both minimums are given, the larger
    of the two required scale factors is used so both are satisfied.

    Args:
        img: PIL Image object
        min_height: Minimum height in pixels (None or 0 disables the check)
        min_width: Minimum width in pixels (None or 0 disables the check)

    Returns:
        The resized PIL Image object, or the original object unchanged if it
        already satisfies the requested minimums.
    """
    width, height = img.size

    # Scale factor each constraint would need; 1.0 means "already satisfied"
    scale_height = min_height / height if min_height and height < min_height else 1.0
    scale_width = min_width / width if min_width and width < min_width else 1.0

    # Use the larger scale to ensure both minimums are met
    scale = max(scale_height, scale_width)
    if scale <= 1.0:
        return img

    # Round instead of truncating: int() can turn e.g. 1399.9999999 into
    # 1399 and leave the result one pixel short of the requested minimum.
    new_width = max(1, round(width * scale))
    new_height = max(1, round(height * scale))

    # Guard against any remaining rounding error on the constrained sides
    if min_height:
        new_height = max(new_height, min_height)
    if min_width:
        new_width = max(new_width, min_width)

    return img.resize((new_width, new_height), Image.LANCZOS)
84 +
85 +
def process_image(input_path, output_dir, min_height=1400, min_width=None, tolerance=10):
    """
    Process a single image: crop whitespace and resize.

    The result is written to ``output_dir`` under the same file name as the
    input. Any failure is reported in the return value instead of raised, so
    one bad file cannot abort a whole batch.

    Args:
        input_path: Path to input image
        output_dir: Directory to save processed image
        min_height: Minimum height in pixels
        min_width: Minimum width in pixels (optional)
        tolerance: Whitespace detection tolerance

    Returns:
        Tuple of (success: bool, filename: str, dimensions: tuple, error: str).
        ``dimensions`` is None on failure; ``error`` is None on success.
    """
    source = Path(input_path)

    try:
        # The context manager releases the underlying file handle
        # deterministically, even when a later step raises.
        with Image.open(source) as original:
            # Trim whitespace from left and right
            trimmed = trim_whitespace(original, tolerance)

            # Resize to ensure minimum dimensions
            result = resize_to_min_dimensions(trimmed, min_height, min_width)

            # Save while the source is still open: the processed image may be
            # the very same object as the opened one.
            output_path = Path(output_dir) / source.name
            result.save(output_path, quality=95, optimize=True)

            return (True, source.name, result.size, None)

    except Exception as e:
        # Deliberately broad: this is the per-file boundary of the batch
        return (False, source.name, None, str(e))
120 +
121 +
def process_directory(input_dir, output_dir, min_height=1400, min_width=None, tolerance=10, workers=None):
    """
    Process all images in a directory using parallel processing.

    Only files directly inside ``input_dir`` are considered (no recursion).
    Files are processed in sorted order so the printed report is
    reproducible between runs.

    Args:
        input_dir: Input directory path
        output_dir: Output directory path (created if missing)
        min_height: Minimum height in pixels
        min_width: Minimum width in pixels (optional)
        tolerance: Whitespace detection tolerance
        workers: Number of parallel workers (None = auto-detect)

    Raises:
        ValueError: If ``workers`` is given and is less than 1.
    """
    # Create output directory if it doesn't exist
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Supported image formats ('.tif' is the common short TIFF suffix)
    image_extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tif', '.tiff', '.webp'}

    # Collect image files; sorted because iterdir() order is arbitrary
    image_files = sorted(
        entry for entry in Path(input_dir).iterdir()
        if entry.is_file() and entry.suffix.lower() in image_extensions
    )

    if not image_files:
        print(f"No image files found in {input_dir}")
        return

    # Determine number of workers
    if workers is None:
        workers = max(1, cpu_count() - 1)  # Leave one core free
    elif workers < 1:
        raise ValueError(f"workers must be at least 1, got {workers}")
    # Never start more processes than there are images to hand out
    workers = min(workers, len(image_files))

    print(f"Found {len(image_files)} images to process")
    print(f"Using {workers} parallel workers\n")

    # Bind the per-run settings so the pool only has to pass the file path
    process_func = partial(
        process_image,
        output_dir=output_dir,
        min_height=min_height,
        min_width=min_width,
        tolerance=tolerance,
    )

    # Process images in parallel with progress bar; imap keeps input order
    with Pool(processes=workers) as pool:
        results = list(tqdm(
            pool.imap(process_func, image_files),
            total=len(image_files),
            desc="Processing images",
            unit="img",
        ))

    # Print per-file results
    success_count = 0
    error_count = 0

    print("\n" + "=" * 60)
    for success, filename, dimensions, error in results:
        if success:
            success_count += 1
            print(f"✓ {filename} -> {dimensions[0]}x{dimensions[1]}")
        else:
            error_count += 1
            print(f"✗ {filename}: {error}")

    # Print summary
    print("=" * 60)
    print("\nProcessing complete!")
    print(f"  Success: {success_count}/{len(image_files)}")
    if error_count > 0:
        print(f"  Errors: {error_count}/{len(image_files)}")
    print(f"  Output: {output_dir}")
195 +
196 +
def main():
    """
    Command-line entry point: parse arguments and process a directory.

    Exits with status 1 if the input path is missing or is not a directory,
    and with argparse's usage error (status 2) if --workers is less than 1.
    """
    parser = argparse.ArgumentParser(
        description='Crop whitespace and resize images to minimum dimensions (parallel processing)'
    )
    parser.add_argument(
        'input_dir',
        help='Input directory containing images'
    )
    parser.add_argument(
        'output_dir',
        help='Output directory for processed images'
    )
    parser.add_argument(
        '--min-height',
        type=int,
        default=1400,
        help='Minimum height in pixels (default: 1400)'
    )
    parser.add_argument(
        '--min-width',
        type=int,
        default=None,
        help='Minimum width in pixels (optional)'
    )
    parser.add_argument(
        '--tolerance',
        type=int,
        default=10,
        help='Whitespace detection tolerance 0-255 (default: 10)'
    )
    parser.add_argument(
        '--workers',
        type=int,
        default=None,
        help=f'Number of parallel workers (default: auto = {max(1, cpu_count() - 1)})'
    )

    args = parser.parse_args()

    # Reject a worker count the process pool cannot honour
    if args.workers is not None and args.workers < 1:
        parser.error('--workers must be at least 1')

    # Validate input directory. exists() alone is not enough: a regular file
    # would pass and then fail later when its entries are listed.
    input_path = Path(args.input_dir)
    if not input_path.exists():
        print(f"Error: Input directory '{args.input_dir}' does not exist")
        sys.exit(1)
    if not input_path.is_dir():
        print(f"Error: Input path '{args.input_dir}' is not a directory")
        sys.exit(1)

    # Process images
    process_directory(
        args.input_dir,
        args.output_dir,
        args.min_height,
        args.min_width,
        args.tolerance,
        args.workers
    )
250 +
251 +
# Run the command-line interface only when executed as a script, not on import
if __name__ == "__main__":
    main()
Newer Older