# Source code for wdoc.utils.loaders.url
import re
import goose3
from beartype.typing import List
from langchain_core.documents import Document
from langchain_community.document_loaders import (
PlaywrightURLLoader,
SeleniumURLLoader,
UnstructuredURLLoader,
WebBaseLoader,
)
from loguru import logger
from wdoc.utils.loaders.shared import (
debug_return_empty,
get_url_title,
markdownimage_regex,
)
from wdoc.utils.misc import (
check_docs_tkn_length,
doc_loaders_cache,
optional_strip_unexp_args,
)
# Matches a markdown link whose target is an http(s) URL, i.e. "[text](http://...)"
# or "[text](https://...)". Group 1 captures the link text, so substituting with
# r"\1" (as load_url does) drops the URL and keeps only the visible text.
markdownlinkparser_regex = re.compile(r"\[([^\]]+)\]\(http[s]?://[^)]+\)")
# [docs]
def md_shorten_image_name(md_image: re.Match) -> str:
    """Return a short display name for a matched markdown image.

    The name is read from the first capture group of `md_image`. Names of
    16 characters or fewer are returned untouched; longer ones are reduced
    to their first 8 and last 8 characters joined by an ellipsis.
    """
    label = md_image.group(1)
    if len(label) > 16:
        # keep both ends so the beginning and the extension stay recognisable
        return f"{label[:8]}…{label[-8:]}"
    return label
@debug_return_empty
@optional_strip_unexp_args
@doc_loaders_cache.cache
def load_url(path: str, title=None) -> List[Document]:
    """Load the web page at `path` as documents, trying several parsers in turn.

    The parsers are attempted in this order, stopping at the first one that
    succeeds:

    1. Jina reader (the page fetched through ``https://r.jina.ai/``),
    2. ``PlaywrightURLLoader``,
    3. ``SeleniumURLLoader`` with firefox,
    4. ``SeleniumURLLoader`` with chrome,
    5. ``goose3``,
    6. ``UnstructuredURLLoader``,
    7. ``WebBaseLoader`` on the raw url (last resort).

    An attempt counts as successful only if it produced content and
    ``check_docs_tkn_length`` did not raise. Any exception in an attempt is
    logged as a warning and the next parser is tried.

    Args:
        path: url of the page to load.
        title: optional title for the page. The placeholder ``"Untitled"``
            is treated as no title. When no title is given, the first title
            found by any attempt (even a failed one) is kept, and
            ``get_url_title`` is used as a final fallback.

    Returns:
        The documents produced by the first successful parser. Each document
        gets a ``"title"`` metadata entry when a title is known; if a
        document already carries a different title, the two are combined as
        ``"<title> - <document title>"``.

    Raises:
        RuntimeError: if none of the parsers succeeded.

    NOTE(review): the three decorators are defined elsewhere; their effect
    on arguments, caching and error handling is not visible from this
    function and should be checked in their own modules.
    """
    logger.info(f"Loading url: '{path}'")

    # even if loading fails the title might be found so trying to keep
    # the first working title across trials
    if title == "Untitled":
        title = None

    loaded_success = False

    if not loaded_success:
        try:
            loader = WebBaseLoader("https://r.jina.ai/" + path, raise_for_status=True)
            text = "\n".join([doc.page_content for doc in loader.load()]).strip()
            assert text, "Empty text"
            if not title:
                first_line = text.splitlines()[0]
                if first_line.startswith("Title: "):
                    title = first_line.replace("Title: ", "", 1)
            # an IndexError here (marker missing) is caught below and makes
            # this attempt fail, so the next parser is tried
            text = text.split("Markdown Content:", 1)[1]
            text = markdownlinkparser_regex.sub(r"\1", text)  # remove links
            # remove markdown images for now as caption is disabled so it's
            # just base64 or something like that, keep only a shortened
            # image name
            text = markdownimage_regex.sub(md_shorten_image_name, text)
            docs = [
                Document(
                    page_content=text,
                    metadata={
                        "parser": "jinareader",
                    },
                )
            ]
            if title:
                for doc in docs:
                    doc.metadata["title"] = title
            # presumably raises when the token count looks wrong -- confirm
            # against wdoc.utils.misc
            check_docs_tkn_length(docs, path)
            loaded_success = True
        except Exception as err:
            logger.warning(f"Exception when using jina reader to parse url: '{err}'")

    if not loaded_success:
        try:
            loader = PlaywrightURLLoader(
                urls=[path], remove_selectors=["header", "footer"]
            )
            docs = loader.load()
            assert docs, "Empty docs when using playwright"
            if not title and "title" in docs[0].metadata:
                title = docs[0].metadata["title"]
            check_docs_tkn_length(docs, path)
            loaded_success = True
        except Exception as err:
            logger.warning(f"Exception when using playwright to parse url: '{err}'")

    if not loaded_success:
        try:
            loader = SeleniumURLLoader(urls=[path], browser="firefox")
            docs = loader.load()
            assert docs, "Empty docs when using selenium firefox"
            if (
                not title
                and "title" in docs[0].metadata
                and docs[0].metadata["title"] != "No title found."
            ):
                title = docs[0].metadata["title"]
            check_docs_tkn_length(docs, path)
            loaded_success = True
        except Exception as err:
            logger.warning(
                f"Exception when using selenium firefox to parse url: '{err}'"
            )

    if not loaded_success:
        try:
            loader = SeleniumURLLoader(urls=[path], browser="chrome")
            docs = loader.load()
            assert docs, "Empty docs when using selenium chrome"
            if (
                not title
                and "title" in docs[0].metadata
                and docs[0].metadata["title"] != "No title found."
            ):
                title = docs[0].metadata["title"]
            check_docs_tkn_length(docs, path)
            loaded_success = True
        except Exception as err:
            logger.warning(
                f"Exception when using selenium chrome to parse url: '{err}'\nUsing goose as fallback"
            )

    if not loaded_success:
        try:
            g = goose3.Goose()
            article = g.extract(url=path)
            text = article.cleaned_text
            # check the extracted text itself: a list wrapping an empty
            # document is still truthy and would hide an empty extraction
            assert text and text.strip(), "Empty docs when using goose"
            docs = [Document(page_content=text)]
            # the document was just created without metadata, so the only
            # possible title source here is the goose article
            if not title and article.title:
                title = article.title
            check_docs_tkn_length(docs, path)
            loaded_success = True
        except Exception as err:
            logger.warning(f"Exception when using goose to parse url: '{err}'")

    if not loaded_success:
        try:
            loader = UnstructuredURLLoader([path])
            docs = loader.load()
            assert docs, "Empty docs when using UnstructuredURLLoader"
            if not title and "title" in docs[0].metadata and docs[0].metadata["title"]:
                title = docs[0].metadata["title"]
            check_docs_tkn_length(docs, path)
            loaded_success = True
        except Exception as err:
            logger.warning(
                f"Exception when using UnstructuredURLLoader to parse url: '{err}'"
            )

    if not loaded_success:
        try:
            loader = WebBaseLoader(path, raise_for_status=True)
            docs = loader.load()
            assert docs, "Empty docs when using html"
            if not title and "title" in docs[0].metadata and docs[0].metadata["title"]:
                title = docs[0].metadata["title"]
            check_docs_tkn_length(docs, path)
            loaded_success = True
        except Exception as err:
            logger.warning(
                f"Exception when using html as LAST RESORT to parse url: '{err}'"
            )

    # Every parser failed. Without this guard `docs` is either unbound
    # (UnboundLocalError further down) or still holds the output of an
    # attempt that was rejected after `docs` had been assigned.
    if not loaded_success:
        raise RuntimeError(f"Failed to load url with every available parser: '{path}'")

    # last resort, try to get the title from the most basic loader
    if not title:
        title = get_url_title(path)

    # store the title as metadata if missing
    if title:
        for d in docs:
            if "title" not in d.metadata or not d.metadata["title"]:
                d.metadata["title"] = title
            else:
                # keep the document's own title but prefix it with ours
                if d.metadata["title"] != title:
                    d.metadata["title"] = f"{title} - {d.metadata['title']}"

    return docs