Source code for data_manipulation.beautifulsoup_
import re
from typing import TYPE_CHECKING, Optional
from loguru import logger
if TYPE_CHECKING:
    from bs4 import BeautifulSoup
# Constants
DEFAULT_TIMEOUT = 10
DEFAULT_HEADERS = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
def preprocess(html: str) -> str:
    """Removes extraneous whitespace and newline characters from an HTML string.

    Args:
        html (str): HTML string to be cleaned.

    Returns:
        str: Cleaned HTML string with normalized whitespace.

    Examples:
        >>> a = "<html> <p> Something </p> </html> "
        >>> preprocess(a)
        '<html><p>Something</p></html>'

    Note:
        Reference: https://stackoverflow.com/questions/23241641
    """
    # strip leading and trailing whitespace on every line
    pattern = re.compile(r"(^\s+)|(\s+$)", re.MULTILINE)
    html = re.sub(pattern, "", html)
    # replace remaining newlines with spaces so words on adjacent lines stay separated
    html = re.sub("\n", " ", html)
    # remove whitespace immediately before any tag
    html = re.sub(r"\s+<", "<", html)
    # remove whitespace immediately after any tag
    html = re.sub(r">\s+", ">", html)
    return html
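

# A minimal sketch (illustrative, not part of the module API) of how preprocess()
# collapses whitespace between tags while leaving text content intact; the markup
# below is made up for demonstration and needs no network access.
def _preprocess_demo() -> None:
    raw = """
        <html>
            <body>
                <p>  Hello, world  </p>
            </body>
        </html>
    """
    cleaned = preprocess(raw)
    # Whitespace around tags is gone; the paragraph text itself survives.
    assert cleaned == "<html><body><p>Hello, world</p></body></html>"
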
def build_soup(
    url: str,
    features: str = "lxml",
    to_preprocess: bool = True,
    timeout: int = DEFAULT_TIMEOUT,
    headers: Optional[dict] = None,
) -> "Optional[BeautifulSoup]":
"""Creates a BeautifulSoup object from a given URL.
Args:
url (str): URL to fetch and parse.
features (str, optional): Parser to use. Defaults to "lxml".
to_preprocess (bool, optional): Whether to preprocess the HTML. Defaults to True.
timeout (int, optional): Request timeout in seconds. Defaults to 10.
headers (Optional[dict], optional): Custom headers for the request. Defaults to None.
Returns:
Optional[BeautifulSoup]: Parsed BeautifulSoup object, or None if request fails.
Examples:
>>> a = build_soup("https://google.com")
>>> type(a)
<class 'bs4.BeautifulSoup'>
Note:
Requires requests and beautifulsoup4 packages.
"""
import requests
from bs4 import BeautifulSoup
if not url or not url.strip():
raise ValueError("URL cannot be empty")
if not url.startswith(("http://", "https://")):
raise ValueError("URL must start with http:// or https://")
request_headers = headers or DEFAULT_HEADERS
try:
with requests.Session() as session:
response = session.get(url, headers=request_headers, timeout=timeout)
response.raise_for_status()
html_content = preprocess(response.text) if to_preprocess else response.text
return BeautifulSoup(html_content, features=features)
except requests.RequestException as e:
logger.error(f"Failed to fetch URL {url}: {str(e)}")
return None
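

# A minimal usage sketch for build_soup() (illustrative only): it assumes network
# access, and https://example.com plus the custom User-Agent below are placeholders,
# not values the module prescribes.
def _build_soup_demo() -> None:
    soup = build_soup(
        "https://example.com",
        timeout=5,
        headers={"User-Agent": "my-scraper/0.1"},  # overrides DEFAULT_HEADERS
    )
    if soup is None:
        # build_soup already logged the failure and returned None.
        logger.warning("No soup to work with; skipping extraction.")
        return
    # soup.title can be None when the page lacks a <title> element.
    print(soup.title.get_text() if soup.title else "<no title>")
    for link in soup.find_all("a", href=True):
        print(link["href"])
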
if __name__ == "__main__":
    import doctest

    doctest.testmod()