# Source code for archivebox.extractors

__package__ = 'archivebox.extractors'

import os
from pathlib import Path

from typing import Optional, List, Iterable, Union
from datetime import datetime, timezone
from django.db.models import QuerySet

from ..index.schema import Link
from ..index.sql import write_link_to_sql_index
from ..index import (
    load_link_details,
    write_link_details,
)
from ..util import enforce_types
from ..logging_util import (
    log_archiving_started,
    log_archiving_paused,
    log_archiving_finished,
    log_link_archiving_started,
    log_link_archiving_finished,
    log_archive_method_started,
    log_archive_method_finished,
)
from ..search import write_search_index

from .title import should_save_title, save_title
from .favicon import should_save_favicon, save_favicon
from .wget import should_save_wget, save_wget
from .singlefile import should_save_singlefile, save_singlefile
from .readability import should_save_readability, save_readability
from .mercury import should_save_mercury, save_mercury
from .pdf import should_save_pdf, save_pdf
from .screenshot import should_save_screenshot, save_screenshot
from .dom import should_save_dom, save_dom
from .git import should_save_git, save_git
from .media import should_save_media, save_media
from .archive_org import should_save_archive_dot_org, save_archive_dot_org
from .headers import should_save_headers, save_headers


def get_default_archive_methods():
    """Return the default archive methods as a list of 3-tuples.

    Each entry is ``(name, should_save_func, save_func)``, where ``name`` is
    the method's string identifier and the two callables are the matching
    ``should_save_*`` / ``save_*`` functions imported from the sibling
    extractor modules.

    The list order is meaningful: ``readability`` must stay below ``wget``
    and ``singlefile`` because it depends on them.
    """
    return [
        ('title', should_save_title, save_title),
        ('favicon', should_save_favicon, save_favicon),
        ('headers', should_save_headers, save_headers),
        ('singlefile', should_save_singlefile, save_singlefile),
        ('pdf', should_save_pdf, save_pdf),
        ('screenshot', should_save_screenshot, save_screenshot),
        ('dom', should_save_dom, save_dom),
        ('wget', should_save_wget, save_wget),
        # keep readability below wget and singlefile, as it depends on them
        ('readability', should_save_readability, save_readability),
        ('mercury', should_save_mercury, save_mercury),
        ('git', should_save_git, save_git),
        ('media', should_save_media, save_media),
        ('archive_org', should_save_archive_dot_org, save_archive_dot_org),
    ]
# (method name, rank) pairs, ranks counting up from 1 in the order listed.
# NOTE(review): presumably a lower rank means the method's output is preferred
# when indexing -- confirm against the code that consumes this constant.
ARCHIVE_METHODS_INDEXING_PRECEDENCE = [
    (method_name, rank)
    for rank, method_name in enumerate(
        ('readability', 'singlefile', 'dom', 'wget'),
        start=1,
    )
]
@enforce_types
def ignore_methods(to_ignore: List[str]) -> List[str]:
    """Return the names of the default archive methods not in ``to_ignore``.

    Args:
        to_ignore: Method names (e.g. ``'wget'``, ``'pdf'``) to leave out.

    Returns:
        The remaining method names, in the same order that
        ``get_default_archive_methods()`` lists them.
    """
    # Each default method is a (name, should_save, save) tuple; only the name
    # is needed here.
    return [
        method[0]
        for method in get_default_archive_methods()
        if method[0] not in to_ignore
    ]