Source code for data_manipulation.beautifulsoup_
import re
from typing import TYPE_CHECKING, Optional
from loguru import logger
if TYPE_CHECKING:
    from bs4 import BeautifulSoup
# Constants
DEFAULT_TIMEOUT = 10
DEFAULT_HEADERS = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
def preprocess(html: str) -> str:
    """Removes extraneous whitespace and newline characters from an HTML string.

    Args:
        html (str): HTML string to be cleaned.

    Returns:
        str: Cleaned HTML string with normalized whitespace.

    Examples:
        >>> a = "<html> <p> Something </p> </html> "
        >>> preprocess(a)
        '<html><p>Something</p></html>'

    Note:
        Reference: https://stackoverflow.com/questions/23241641
    """
    # strip leading and trailing whitespace on every line
    pattern = re.compile(r"(^\s+)|(\s+$)", re.MULTILINE)
    html = re.sub(pattern, "", html)
    # replace remaining newlines with spaces so words on adjacent lines stay separated
    html = re.sub("\n", " ", html)
    # remove whitespace immediately before any tag
    html = re.sub(r"\s+<", "<", html)
    # remove whitespace immediately after any tag
    html = re.sub(r">\s+", ">", html)
    return html
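

# A minimal sketch (illustrative, not part of the module API) of how preprocess()
# collapses whitespace between tags while leaving text content intact; the markup
# below is made up for demonstration and needs no network access.
def _preprocess_demo() -> None:
    raw = """
        <html>
            <body>
                <p>  Hello, world  </p>
            </body>
        </html>
    """
    cleaned = preprocess(raw)
    # Whitespace around tags is gone; the paragraph text itself survives.
    assert cleaned == "<html><body><p>Hello, world</p></body></html>"
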
def build_soup(
    url: str,
    features: str = "lxml",
    to_preprocess: bool = True,
    timeout: int = DEFAULT_TIMEOUT,
    headers: Optional[dict] = None,
) -> "Optional[BeautifulSoup]":
"""Creates a BeautifulSoup object from a given URL.
Args:
url (str): URL to fetch and parse.
features (str, optional): Parser to use. Defaults to "lxml".
to_preprocess (bool, optional): Whether to preprocess the HTML. Defaults to True.
timeout (int, optional): Request timeout in seconds. Defaults to 10.
headers (Optional[dict], optional): Custom headers for the request. Defaults to None.
Returns:
Optional[BeautifulSoup]: Parsed BeautifulSoup object, or None if request fails.
Examples:
>>> a = build_soup("https://google.com")
>>> type(a)
<class 'bs4.BeautifulSoup'>
Note:
Requires requests and beautifulsoup4 packages.
"""
import requests
from bs4 import BeautifulSoup
if not url or not url.strip():
raise ValueError("URL cannot be empty")
if not url.startswith(("http://", "https://")):
raise ValueError("URL must start with http:// or https://")
request_headers = headers or DEFAULT_HEADERS
try:
with requests.Session() as session:
response = session.get(url, headers=request_headers, timeout=timeout)
response.raise_for_status()
html_content = preprocess(response.text) if to_preprocess else response.text
return BeautifulSoup(html_content, features=features)
except requests.RequestException as e:
logger.error(f"Failed to fetch URL {url}: {str(e)}")
return None
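

# A minimal usage sketch for build_soup() (illustrative only): it assumes network
# access, and https://example.com plus the custom User-Agent below are placeholders,
# not values the module prescribes.
def _build_soup_demo() -> None:
    soup = build_soup(
        "https://example.com",
        timeout=5,
        headers={"User-Agent": "my-scraper/0.1"},  # overrides DEFAULT_HEADERS
    )
    if soup is None:
        # build_soup already logged the failure and returned None.
        logger.warning("No soup to work with; skipping extraction.")
        return
    # soup.title can be None when the page lacks a <title> element.
    print(soup.title.get_text() if soup.title else "<no title>")
    for link in soup.find_all("a", href=True):
        print(link["href"])
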
if __name__ == "__main__":
    import doctest

    doctest.testmod()