Source code for archivebox.extractors

# Explicit package name: Python's import system consults `__package__` when
# resolving the relative imports below (`from ..config import ...`,
# `from .title import ...`).
__package__ = 'archivebox.extractors'

import os
import sys
from pathlib import Path

from typing import Callable, Optional, List, Iterable, Union
from datetime import datetime, timezone
from django.db.models import QuerySet

from ..config import (
    SAVE_ALLOWLIST_PTN,
    SAVE_DENYLIST_PTN,
)
from ..core.settings import ERROR_LOG
from ..index.schema import ArchiveResult, Link
from ..index.sql import write_link_to_sql_index
from ..index import (
    load_link_details,
    write_link_details,
)
from ..util import enforce_types
from ..logging_util import (
    log_archiving_started,
    log_archiving_paused,
    log_archiving_finished,
    log_link_archiving_started,
    log_link_archiving_finished,
    log_archive_method_started,
    log_archive_method_finished,
)
from ..search import write_search_index

from .title import should_save_title, save_title
from .favicon import should_save_favicon, save_favicon
from .wget import should_save_wget, save_wget
from .singlefile import should_save_singlefile, save_singlefile
from .readability import should_save_readability, save_readability
from .mercury import should_save_mercury, save_mercury
from .htmltotext import should_save_htmltotext, save_htmltotext
from .pdf import should_save_pdf, save_pdf
from .screenshot import should_save_screenshot, save_screenshot
from .dom import should_save_dom, save_dom
from .git import should_save_git, save_git
from .media import should_save_media, save_media
from .archive_org import should_save_archive_dot_org, save_archive_dot_org
from .headers import should_save_headers, save_headers


# Signature shared by the `should_save_*` predicates imported above: called
# with a Link, an optional Path and an optional bool, and returns a bool.
# NOTE(review): the Path and bool are presumably the output directory and an
# "overwrite" flag -- confirm against the individual extractor modules.
ShouldSaveFunction = Callable[[Link, Optional[Path], Optional[bool]], bool]
# Signature shared by the `save_*` functions imported above: called with a
# Link, an optional Path and an int, and returns an ArchiveResult.
# NOTE(review): the int is presumably a timeout -- confirm against the extractors.
SaveFunction = Callable[[Link, Optional[Path], int], ArchiveResult]
# Shape of one entry in the list returned by get_default_archive_methods():
# (method name, should-save predicate, save function).
ArchiveMethodEntry = tuple[str, ShouldSaveFunction, SaveFunction]

def get_default_archive_methods() -> List[ArchiveMethodEntry]:
    """Return the built-in archive methods as a freshly built list.

    Each entry is a ``(name, should_save_function, save_function)`` tuple.
    The order of the entries is significant: see the inline comment about
    the methods that must stay below ``wget`` and ``singlefile``.

    Returns:
        A new list on every call, so mutating the result does not affect
        later callers.
    """
    return [
        ('favicon', should_save_favicon, save_favicon),
        ('headers', should_save_headers, save_headers),
        ('singlefile', should_save_singlefile, save_singlefile),
        ('pdf', should_save_pdf, save_pdf),
        ('screenshot', should_save_screenshot, save_screenshot),
        ('dom', should_save_dom, save_dom),
        ('wget', should_save_wget, save_wget),
        # keep title, readability, and htmltotext below wget and singlefile, as they depend on them
        ('title', should_save_title, save_title),
        ('readability', should_save_readability, save_readability),
        ('mercury', should_save_mercury, save_mercury),
        ('htmltotext', should_save_htmltotext, save_htmltotext),
        ('git', should_save_git, save_git),
        ('media', should_save_media, save_media),
        ('archive_org', should_save_archive_dot_org, save_archive_dot_org),
    ]
# (method name, rank) pairs for the extractors whose output can be indexed.
# NOTE(review): presumably a lower rank means the method's output is preferred
# as the text source for the search index -- confirm against the consumer.
ARCHIVE_METHODS_INDEXING_PRECEDENCE = [
    ('readability', 1),
    ('mercury', 2),
    ('htmltotext', 3),
    ('singlefile', 4),
    ('dom', 5),
    ('wget', 6),
]
@enforce_types
def ignore_methods(to_ignore: List[str]) -> Iterable[str]:
    """Return the default archive method names that are not in *to_ignore*.

    Args:
        to_ignore: Method names to leave out (e.g. ``['title', 'wget']``).
            Names that match no default method are simply ignored.

    Returns:
        A list of the remaining method names, in the same order as
        ``get_default_archive_methods()`` yields them.
    """
    return [
        name
        for name, _should_save, _save in get_default_archive_methods()
        if name not in to_ignore
    ]