
March 23, 2025 at 05:22 PM

```python
import os
import re
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from urllib.parse import urlparse, urljoin
from io import BytesIO
from PIL import Image
import hashlib
import time


def download_and_save_image(image_url, base_url, download_dir, min_width=50, min_height=50):
    """Downloads and saves an image, handling relative URLs and duplicates.

    Args:
        image_url: The URL of the image.
        base_url: The base URL of the webpage (for resolving relative URLs).
        download_dir: The directory to save the images.
        min_width: Minimum width of the image to download.
        min_height: Minimum height of the image to download.

    Returns:
        None if the image was not downloaded, otherwise the filename.
    """
    try:
        # Handle relative URLs
        absolute_image_url = urljoin(base_url, image_url)

        # Check that the URL is valid
        parsed_url = urlparse(absolute_image_url)
        if not all([parsed_url.scheme, parsed_url.netloc]):
            print(f"Invalid image URL: {absolute_image_url}")
            return None

        # Download the image
        response = requests.get(absolute_image_url, stream=True)
        response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)

        # Get the image data as bytes
        image_data = BytesIO(response.content)

        # Check the image dimensions with PIL
        try:
            img = Image.open(image_data)
            width, height = img.size
            if width < min_width or height < min_height:
                print(f"Image too small ({width}x{height}): {absolute_image_url}")
                return None
        except (OSError, Image.UnidentifiedImageError) as e:
            print(f"Error opening image with PIL: {absolute_image_url} - {e}")
            return None

        # Generate a unique filename from a hash of the image content
        image_hash = hashlib.md5(response.content).hexdigest()
        file_extension = os.path.splitext(parsed_url.path)[1]  # Extension from the URL path

        # Handle cases where the URL has no extension
        if not file_extension:
            # Try to derive the extension from the Content-Type header
            content_type = response.headers.get("content-type")
            if content_type:
                if "jpeg" in content_type or "jpg" in content_type:
                    file_extension = ".jpg"
                elif "png" in content_type:
                    file_extension = ".png"
                elif "gif" in content_type:
                    file_extension = ".gif"
                elif "webp" in content_type:
                    file_extension = ".webp"
                # Add more content types as needed
                else:
                    file_extension = ".jpg"  # Default to .jpg if not detected
            else:
                file_extension = ".jpg"  # Fallback if there is no Content-Type

        filename = f"{image_hash}{file_extension}"
        filepath = os.path.join(download_dir, filename)

        # Skip duplicates: identical content hashes to an existing filename
        if os.path.exists(filepath):
            print(f"Image already downloaded: {absolute_image_url}")
            return None

        # Save the image (the full content is already in memory from the checks above)
        with open(filepath, "wb") as f:
            f.write(response.content)

        print(f"Image saved: {absolute_image_url} -> {filename}")
        return filename

    except requests.exceptions.RequestException as e:
        print(f"Error downloading image: {absolute_image_url} - {e}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None


def detect_and_save_images(url, output_dir="images", min_width=50, min_height=50, delay=2):
    """Detects images on a webpage and saves them to a directory.

    Args:
        url: The URL of the webpage.
        output_dir: The directory to save the images. Defaults to "images".
        min_width: The minimum width (in pixels) of images to save.
        min_height: The minimum height (in pixels) of images to save.
        delay: Delay in seconds to wait for the page to load.

    Returns:
        A list of filenames of the downloaded images.
    """
    # Create the output directory if it doesn't exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    downloaded_filenames = []

    try:
        # --- Set up the Chrome WebDriver ---
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--window-size=1920,1080")
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=chrome_options)

        # --- Navigate to the URL ---
        driver.get(url)
        time.sleep(delay)  # Wait for the page (and late-loading images) to render

        # --- Find <img> tags ---
        img_elements = driver.find_elements(By.TAG_NAME, "img")
        image_urls = [img.get_attribute("src") for img in img_elements if img.get_attribute("src")]

        # --- Find elements with CSS background images ---
        # Get all elements, then filter in Python. More robust.
        all_elements = driver.find_elements(By.XPATH, "//*")
        for element in all_elements:
            try:
                bg_image_url = element.value_of_css_property("background-image")
                if bg_image_url and bg_image_url != "none":
                    # Extract the URL from the computed CSS value,
                    # e.g. url("image.jpg") or url(image.jpg)
                    match = re.search(r'url\(["\']?(.*?)["\']?\)', bg_image_url)
                    if match and match.group(1):
                        image_urls.append(match.group(1))
            except Exception:
                pass  # Ignore errors while reading background images

        # --- Download and save the images ---
        for image_url in image_urls:
            if image_url:  # Skip empty strings
                filename = download_and_save_image(image_url, url, output_dir, min_width, min_height)
                if filename:
                    downloaded_filenames.append(filename)

        return downloaded_filenames

    except Exception as e:
        print(f"An error occurred: {e}")
        return []  # Return an empty list on error
    finally:
        if "driver" in locals():
            driver.quit()


# --- Example usage ---
if __name__ == "__main__":
    url_to_scrape = "https://vgen.co/kcpasin/portfolio"  # Replace with your desired URL
    # url_to_scrape = "https://www.google.com"      # Simple test
    # url_to_scrape = "https://www.wikipedia.org/"  # More complex test
    # url_to_scrape = "https://www.amazon.com"      # Very complex site

    downloaded_files = detect_and_save_images(
        url_to_scrape,
        output_dir="portfolio_images",
        min_width=100,
        min_height=100,
        delay=5,
    )

    print(f"\nDownloaded {len(downloaded_files)} images:")
    for filename in downloaded_files:
        print(filename)
```

This code performs the following operations:

### Purpose of the Code

The script uses the Selenium, Requests, and Pillow (PIL) libraries to extract, validate, and save images from a webpage, covering both <img> tags and CSS background images. Its core job is web scraping: it downloads every image a page references, enforces minimum dimensions, skips duplicates, and stores the results locally.


### Workflow and Main Functions

1. download_and_save_image(image_url, base_url, download_dir, min_width=50, min_height=50)

  • Purpose: Downloads and saves an image.
  • Steps:
    • Resolves relative image URLs using the base_url.
    • Validates the URL.
    • Downloads the image using the requests library.
    • Checks the image's width and height against the min_width and min_height thresholds, skipping images that are too small.
    • Derives the filename from an MD5 hash of the image content, so a previously downloaded image resolves to an existing file and is skipped.
    • Saves the image in the specified directory (download_dir) under that hash-based name, with an extension taken from the URL or the Content-Type header.
  • Returns: The saved filename, or None if the image is too small, invalid, or already downloaded.
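
For a sense of the call contract, here is a minimal standalone sketch; the module name image_scraper and all URLs are hypothetical:

```python
import os

from image_scraper import download_and_save_image  # hypothetical module name

os.makedirs("images", exist_ok=True)  # the function expects the directory to exist

# A relative src such as "/static/logo.png" is resolved against base_url
saved = download_and_save_image(
    image_url="/static/logo.png",
    base_url="https://example.com/gallery",
    download_dir="images",
    min_width=100,
    min_height=100,
)
print(saved)  # a hash-based filename like "5f4d....png", or None if skipped
```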

2. detect_and_save_images(url, output_dir="images", min_width=50, min_height=50, delay=2)

  • Purpose: Detects and saves all images from a webpage.
  • Steps:
    1. Creates the output directory (output_dir) if it doesn't exist.
    2. Sets up a headless Chrome WebDriver using selenium and webdriver_manager.
    3. Opens the given webpage (url).
    4. Waits for the page to load (time.sleep(delay)).
    5. Finds <img> tags and extracts their src attributes.
    6. Detects elements whose CSS background-image property is set and extracts the image URLs from the computed values (illustrated in the sketch after this list).
    7. Calls the download_and_save_image function for each detected image URL.
  • Returns: A list of filenames of successfully downloaded images.
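
Step 6 is the least obvious. value_of_css_property returns the computed CSS value, which wraps the address in url(...); the sketch below shows the extraction on a made-up value, using the same pattern as the script:

```python
import re

# The computed value looks like 'url("https://example.com/hero.jpg")',
# or the string "none" when no background image is set.
css_value = 'url("https://example.com/hero.jpg")'  # illustrative example value

match = re.search(r'url\(["\']?(.*?)["\']?\)', css_value)
if match:
    print(match.group(1))  # -> https://example.com/hero.jpg
```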

### Example Usage

The script is runnable as a standalone program via the if __name__ == "__main__": block. Here's what happens in the example:

  • The script scrapes the URL https://vgen.co/kcpasin/portfolio.
  • The images are saved in the portfolio_images directory, with a minimum width/height of 100px and a 5-second page load delay.
  • It logs the total count of downloaded images and their filenames.
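
Pointing the scraper at a different site only takes a different call. A minimal sketch, assuming it runs in the same module as the functions above (the wiki_images directory name is made up; the URL is one of the commented-out test URLs in the script):

```python
# Scrape one of the alternative test URLs from the __main__ block
files = detect_and_save_images(
    "https://www.wikipedia.org/",
    output_dir="wiki_images",  # hypothetical output directory
    min_width=100,
    min_height=100,
    delay=5,
)
print(f"Downloaded {len(files)} images")
```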

### Key Features

  1. Image Validation and Deduplication:

    • Ensures downloaded images meet minimum dimensions (width and height).
    • Prevents downloading duplicate images by using MD5 hashes of the image content.
  2. Supports Both <img> and CSS Background Images:

    • Searches for <img> tags.
    • Inspects all elements for CSS "background-image" properties.
  3. Dynamic Webpage Support:

    • Uses Selenium WebDriver, so content rendered by JavaScript is visible to the scraper.
    • Allows a custom delay (delay parameter) to wait for the page to fully load; a more robust explicit-wait alternative is sketched after this list.
  4. Error Handling:

    • Handles network errors, invalid URLs, and unsupported image formats using exception handling.
    • Prints warnings and continues instead of aborting the run when a single image fails.
  5. Customizable Parameters:

    • Directory to store images (output_dir).
    • Minimum image size (min_width and min_height).
    • URL to scrape and page load delays.
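
A fixed time.sleep either wastes time or fires before slow content arrives. As an alternative that is not part of the original script, Selenium's explicit waits block only until a condition is met; a minimal sketch (wait_for_images is a hypothetical helper):

```python
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def wait_for_images(driver, timeout=10):
    """Block until at least one <img> element is present, up to timeout seconds."""
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.TAG_NAME, "img"))
    )
```

The time.sleep(delay) call in detect_and_save_images could then be replaced with wait_for_images(driver), though a short extra sleep may still help with lazy-loaded images.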

### Output

  • After execution, all valid images are saved in the specified folder (e.g., portfolio_images).
  • Outputs a list of filenames in the console with the total count of successfully downloaded images.

### Dependencies

  • Requests for downloading images.
  • Selenium for automated webpage interaction.
  • Pillow (PIL) for handling and validating images.
  • Webdriver-Manager to manage the ChromeDriver installation.
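
Assuming the standard PyPI package names (inferred from the imports, not stated here), everything can typically be installed with pip install requests selenium webdriver-manager pillow; note that the PIL import is provided by the pillow package.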

In summary, this code is a web scraping utility for downloading images from webpages, handling both inline <img> images and CSS background images.
