
March 23, 2025 at 05:22 PM

```python
import os
import re
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from urllib.parse import urlparse, urljoin
from io import BytesIO
from PIL import Image
import hashlib
import time


def download_and_save_image(image_url, base_url, download_dir, min_width=50, min_height=50):
    """Downloads and saves an image, handling relative URLs and duplicates.

    Args:
        image_url: The URL of the image.
        base_url: The base URL of the webpage (for resolving relative URLs).
        download_dir: The directory to save the images.
        min_width: Minimum width of the image to download.
        min_height: Minimum height of the image to download.

    Returns:
        None if the image was not downloaded, otherwise the filename.
    """
    try:
        # Handle relative URLs
        absolute_image_url = urljoin(base_url, image_url)

        # Check that the URL is valid
        parsed_url = urlparse(absolute_image_url)
        if not all([parsed_url.scheme, parsed_url.netloc]):
            print(f"Invalid image URL: {absolute_image_url}")
            return None

        # Download the image
        response = requests.get(absolute_image_url, stream=True)
        response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)

        # Get the image data as bytes
        image_data = BytesIO(response.content)

        # Check the image dimensions with PIL
        try:
            img = Image.open(image_data)
            width, height = img.size
            if width < min_width or height < min_height:
                print(f"Image too small ({width}x{height}): {absolute_image_url}")
                return None
        except (OSError, Image.UnidentifiedImageError) as e:
            print(f"Error opening image with PIL: {absolute_image_url} - {e}")
            return None

        # Generate a unique filename from a hash of the image content
        image_hash = hashlib.md5(response.content).hexdigest()
        file_extension = os.path.splitext(parsed_url.path)[1]  # Extension from the URL path

        # Handle cases where the URL has no extension
        if not file_extension:
            # Try to derive the extension from the Content-Type header
            content_type = response.headers.get("content-type")
            if content_type:
                if "jpeg" in content_type or "jpg" in content_type:
                    file_extension = ".jpg"
                elif "png" in content_type:
                    file_extension = ".png"
                elif "gif" in content_type:
                    file_extension = ".gif"
                elif "webp" in content_type:
                    file_extension = ".webp"
                # Add more content types as needed
                else:
                    file_extension = ".jpg"  # Default to .jpg if not detected
            else:
                file_extension = ".jpg"  # Fallback if there is no Content-Type

        filename = f"{image_hash}{file_extension}"
        filepath = os.path.join(download_dir, filename)

        # Skip duplicates: identical content hashes to an existing filename
        if os.path.exists(filepath):
            print(f"Image already downloaded: {absolute_image_url}")
            return None

        # Save the image (the full content is already in memory from the checks above)
        with open(filepath, "wb") as f:
            f.write(response.content)

        print(f"Image saved: {absolute_image_url} -> {filename}")
        return filename

    except requests.exceptions.RequestException as e:
        print(f"Error downloading image: {absolute_image_url} - {e}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None


def detect_and_save_images(url, output_dir="images", min_width=50, min_height=50, delay=2):
    """Detects images on a webpage and saves them to a directory.

    Args:
        url: The URL of the webpage.
        output_dir: The directory to save the images. Defaults to "images".
        min_width: The minimum width (in pixels) of images to save.
        min_height: The minimum height (in pixels) of images to save.
        delay: Delay in seconds to wait for the page to load.

    Returns:
        A list of filenames of the downloaded images.
    """
    # Create the output directory if it doesn't exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    downloaded_filenames = []

    try:
        # --- Set up the Chrome WebDriver ---
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--window-size=1920,1080")
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=chrome_options)

        # --- Navigate to the URL ---
        driver.get(url)
        time.sleep(delay)  # Wait for the page (and late-loading images) to render

        # --- Find <img> tags ---
        img_elements = driver.find_elements(By.TAG_NAME, "img")
        image_urls = [img.get_attribute("src") for img in img_elements if img.get_attribute("src")]

        # --- Find elements with CSS background images ---
        # Get all elements, then filter in Python. More robust.
        all_elements = driver.find_elements(By.XPATH, "//*")
        for element in all_elements:
            try:
                bg_image_url = element.value_of_css_property("background-image")
                if bg_image_url and bg_image_url != "none":
                    # Extract the URL from the computed CSS value,
                    # e.g. url("image.jpg") or url(image.jpg)
                    match = re.search(r'url\(["\']?(.*?)["\']?\)', bg_image_url)
                    if match and match.group(1):
                        image_urls.append(match.group(1))
            except Exception:
                pass  # Ignore errors while reading background images

        # --- Download and save the images ---
        for image_url in image_urls:
            if image_url:  # Skip empty strings
                filename = download_and_save_image(image_url, url, output_dir, min_width, min_height)
                if filename:
                    downloaded_filenames.append(filename)

        return downloaded_filenames

    except Exception as e:
        print(f"An error occurred: {e}")
        return []  # Return an empty list on error
    finally:
        if "driver" in locals():
            driver.quit()


# --- Example usage ---
if __name__ == "__main__":
    url_to_scrape = "https://vgen.co/kcpasin/portfolio"  # Replace with your desired URL
    # url_to_scrape = "https://www.google.com"      # Simple test
    # url_to_scrape = "https://www.wikipedia.org/"  # More complex test
    # url_to_scrape = "https://www.amazon.com"      # Very complex site

    downloaded_files = detect_and_save_images(
        url_to_scrape,
        output_dir="portfolio_images",
        min_width=100,
        min_height=100,
        delay=5,
    )

    print(f"\nDownloaded {len(downloaded_files)} images:")
    for filename in downloaded_files:
        print(filename)
```

This code performs the following operations:

### Purpose of the Code

The script uses the Selenium, Requests, and Pillow (PIL) libraries to extract, validate, and save images from a webpage, covering both <img> tags and CSS background images. Its core job is web scraping: it downloads every image a page references, enforces minimum dimensions, skips duplicates, and stores the results locally.


### Workflow and Main Functions

1. download_and_save_image(image_url, base_url, download_dir, min_width=50, min_height=50)

  • Purpose: Downloads and saves an image.
  • Steps:
    • Resolves relative image URLs using the base_url.
    • Validates the URL.
    • Downloads the image using the requests library.
    • Checks the image's width and height against the min_width and min_height thresholds, skipping images that are too small.
    • Derives the filename from an MD5 hash of the image content, so a previously downloaded image resolves to an existing file and is skipped.
    • Saves the image in the specified directory (download_dir) under that hash-based name, with an extension taken from the URL or the Content-Type header.
  • Returns: The saved filename, or None if the image is too small, invalid, or already downloaded.
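
For a sense of the call contract, here is a minimal standalone sketch; the module name image_scraper and all URLs are hypothetical:

```python
import os

from image_scraper import download_and_save_image  # hypothetical module name

os.makedirs("images", exist_ok=True)  # the function expects the directory to exist

# A relative src such as "/static/logo.png" is resolved against base_url
saved = download_and_save_image(
    image_url="/static/logo.png",
    base_url="https://example.com/gallery",
    download_dir="images",
    min_width=100,
    min_height=100,
)
print(saved)  # a hash-based filename like "5f4d....png", or None if skipped
```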

2. detect_and_save_images(url, output_dir="images", min_width=50, min_height=50, delay=2)

  • Purpose: Detects and saves all images from a webpage.
  • Steps:
    1. Creates the output directory (output_dir) if it doesn't exist.
    2. Sets up a headless Chrome WebDriver using selenium and webdriver_manager.
    3. Opens the given webpage (url).
    4. Waits for the page to load (time.sleep(delay)).
    5. Finds <img> tags and extracts their src attributes.
    6. Detects elements whose CSS background-image property is set and extracts the image URLs from the computed values (illustrated in the sketch after this list).
    7. Calls the download_and_save_image function for each detected image URL.
  • Returns: A list of filenames of successfully downloaded images.
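
Step 6 is the least obvious. value_of_css_property returns the computed CSS value, which wraps the address in url(...); the sketch below shows the extraction on a made-up value, using the same pattern as the script:

```python
import re

# The computed value looks like 'url("https://example.com/hero.jpg")',
# or the string "none" when no background image is set.
css_value = 'url("https://example.com/hero.jpg")'  # illustrative example value

match = re.search(r'url\(["\']?(.*?)["\']?\)', css_value)
if match:
    print(match.group(1))  # -> https://example.com/hero.jpg
```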

### Example Usage

The script is runnable as a standalone program via the if __name__ == "__main__": block. Here's what happens in the example:

  • The script scrapes the URL https://vgen.co/kcpasin/portfolio.
  • The images are saved in the portfolio_images directory, with a minimum width/height of 100px and a 5-second page load delay.
  • It logs the total count of downloaded images and their filenames.
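
Pointing the scraper at a different site only takes a different call. A minimal sketch, assuming it runs in the same module as the functions above (the wiki_images directory name is made up; the URL is one of the commented-out test URLs in the script):

```python
# Scrape one of the alternative test URLs from the __main__ block
files = detect_and_save_images(
    "https://www.wikipedia.org/",
    output_dir="wiki_images",  # hypothetical output directory
    min_width=100,
    min_height=100,
    delay=5,
)
print(f"Downloaded {len(files)} images")
```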

### Key Features

  1. Image Validation and Deduplication:

    • Ensures downloaded images meet minimum dimensions (width and height).
    • Prevents downloading duplicate images by using MD5 hashes of the image content.
  2. Supports Both <img> and CSS Background Images:

    • Searches for <img> tags.
    • Inspects all elements for CSS "background-image" properties.
  3. Dynamic Webpage Support:

    • Uses Selenium WebDriver, so content rendered by JavaScript is visible to the scraper.
    • Allows a custom delay (delay parameter) to wait for the page to fully load; a more robust explicit-wait alternative is sketched after this list.
  4. Error Handling:

    • Handles network errors, invalid URLs, and unsupported image formats using exception handling.
    • Prints warnings and continues instead of aborting the run when a single image fails.
  5. Customizable Parameters:

    • Directory to store images (output_dir).
    • Minimum image size (min_width and min_height).
    • URL to scrape and page load delays.
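
A fixed time.sleep either wastes time or fires before slow content arrives. As an alternative that is not part of the original script, Selenium's explicit waits block only until a condition is met; a minimal sketch (wait_for_images is a hypothetical helper):

```python
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def wait_for_images(driver, timeout=10):
    """Block until at least one <img> element is present, up to timeout seconds."""
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.TAG_NAME, "img"))
    )
```

The time.sleep(delay) call in detect_and_save_images could then be replaced with wait_for_images(driver), though a short extra sleep may still help with lazy-loaded images.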

### Output

  • After execution, all valid images are saved in the specified folder (e.g., portfolio_images).
  • Outputs a list of filenames in the console with the total count of successfully downloaded images.

### Dependencies

  • Requests for downloading images.
  • Selenium for automated webpage interaction.
  • Pillow (PIL) for handling and validating images.
  • Webdriver-Manager to manage the ChromeDriver installation.
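
Assuming the standard PyPI package names (inferred from the imports, not stated here), everything can typically be installed with pip install requests selenium webdriver-manager pillow; note that the PIL import is provided by the pillow package.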

In summary, this code is a web scraping utility for downloading images from webpages, handling both inline <img> images and CSS background images.
