Source code for wdoc.utils.loaders.shared

import re
import signal
from contextlib import contextmanager
from functools import cache as memoize
from functools import wraps

from beartype.typing import Callable, Union
from langchain_core.documents import Document

from wdoc.utils.env import env

# Matches a markdown image reference: "![alt]" optionally followed by
# whitespace, then either an inline "(target)" or a reference-style "[label]".
# Group 1 captures the alt text; group 2 captures the target including its
# surrounding parentheses or brackets.
markdownimage_regex = re.compile(r"!\[([^\]]*)\]\s*(\([^\)]+\)|\[[^\]]+\])", re.MULTILINE)


def debug_return_empty(func: Callable) -> Callable:
    """Decorator that optionally swaps a loader for a placeholder producer.

    When ``env.WDOC_EMPTY_LOADER`` is falsy the decorated function is returned
    untouched. Otherwise the returned wrapper never calls ``func``: it ignores
    positional arguments and returns a single-element list holding one
    ``Document`` whose content is ``"Lorem Ipsum"``. Its metadata contains a
    ``debug_empty`` flag, two freshly generated uuid6 strings
    (``content_hash`` and ``all_hash``) and every keyword argument the wrapper
    was called with (keyword arguments win on key collisions).
    """
    if not env.WDOC_EMPTY_LOADER:
        return func

    # Only imported when the debug mode is actually enabled.
    import uuid6

    @wraps(func)
    def wrapper(*args, **kwargs):
        placeholder_metadata = {
            "debug_empty": True,
            "content_hash": str(uuid6.uuid6()),
            "all_hash": str(uuid6.uuid6()),
            **kwargs,
        }
        return [Document(page_content="Lorem Ipsum", metadata=placeholder_metadata)]

    return wrapper


@contextmanager
def signal_timeout(timeout: int, exception: type[Exception]):
    """Context manager that raises ``exception`` if its body runs too long.

    A SIGALRM handler is installed and an alarm is armed for ``timeout``
    seconds. If the alarm fires while the body is running, the handler raises
    ``exception("Timeout occurred")``. On exit the alarm is cancelled and the
    SIGALRM handler that was active before entering is put back, so the
    timeout handler does not leak into the rest of the process.

    If the handler cannot be installed (the original note says this happens
    in some joblib backends), the body runs without any timeout.

    Args:
        timeout: number of seconds before the alarm fires; must be > 0.
        exception: exception *class* instantiated and raised on timeout.

    Raises:
        AssertionError: if ``timeout`` is not strictly positive.
    """
    assert timeout > 0, f"Invalid timeout: {timeout}"

    def signal_handler(signum, frame):
        raise exception("Timeout occurred")

    # Try to install the handler, remembering the previous one for restoring.
    disabled = False
    previous_handler = None
    try:
        previous_handler = signal.signal(signal.SIGALRM, signal_handler)
    except Exception:
        # Deliberate best-effort: fall back to running without a timeout.
        disabled = True

    if disabled:
        yield
        return

    signal.alarm(timeout)
    try:
        yield
    finally:
        # Disable the alarm before touching the handler so a late SIGALRM
        # cannot hit a half-restored state.
        signal.alarm(0)
        # signal.signal() returns None when the previous handler was not
        # installed from Python; that value cannot be passed back in.
        if previous_handler is not None:
            signal.signal(signal.SIGALRM, previous_handler)


@memoize
def get_url_title(url: str) -> Union[str, None]:
    """Last-resort lookup of a page title for ``url``.

    Intended for when the title was not obtained from the regular loader:
    the page is loaded with ``WebBaseLoader`` and the ``title`` entry of the
    first returned document's metadata is used. Results are memoized per
    url for the lifetime of the process.

    Args:
        url: address of the page to load.

    Returns:
        The title when present and non-empty, otherwise ``None`` (including
        when the loader returns no document at all).
    """
    # Imported lazily, only when this fallback is actually used.
    from langchain_community.document_loaders import WebBaseLoader

    loader = WebBaseLoader(url, raise_for_status=True)
    docs = loader.load()
    if not docs:
        return None

    title = docs[0].metadata.get("title")
    return title if title else None