Source code for archivebox.extractors

__package__ = 'archivebox.extractors'

import os

from typing import Optional, List, Iterable
from datetime import datetime

from ..index.schema import Link
from ..index import (
    load_link_details,
    write_link_details,
    patch_main_index,
)
from ..util import enforce_types
from ..logging_util import (
    log_archiving_started,
    log_archiving_paused,
    log_archiving_finished,
    log_link_archiving_started,
    log_link_archiving_finished,
    log_archive_method_started,
    log_archive_method_finished,
)

from .title import should_save_title, save_title
from .favicon import should_save_favicon, save_favicon
from .wget import should_save_wget, save_wget
from .singlefile import should_save_singlefile, save_singlefile
from .readability import should_save_readability, save_readability
from .pdf import should_save_pdf, save_pdf
from .screenshot import should_save_screenshot, save_screenshot
from .dom import should_save_dom, save_dom
from .git import should_save_git, save_git
from .media import should_save_media, save_media
from .archive_org import should_save_archive_dot_org, save_archive_dot_org

def get_default_archive_methods():
    """Return the default archive methods as a new list.

    Each entry is a 3-tuple ``(name, should_save_func, save_func)``. The
    list order is significant; see the note on the ``readability`` entry.
    """
    default_methods = [
        ('title', should_save_title, save_title),
        ('favicon', should_save_favicon, save_favicon),
        ('wget', should_save_wget, save_wget),
        ('singlefile', should_save_singlefile, save_singlefile),
        ('pdf', should_save_pdf, save_pdf),
        ('screenshot', should_save_screenshot, save_screenshot),
        ('dom', should_save_dom, save_dom),
        # keep readability below wget and singlefile, as it depends on them
        ('readability', should_save_readability, save_readability),
        ('git', should_save_git, save_git),
        ('media', should_save_media, save_media),
        ('archive_org', should_save_archive_dot_org, save_archive_dot_org),
    ]
    return default_methods
@enforce_types
def ignore_methods(to_ignore: List[str]):
    """Return the default archive methods that are not listed in *to_ignore*.

    Entries of ``get_default_archive_methods()`` whose name (first tuple
    element) appears in *to_ignore* are dropped; for each remaining entry
    the second tuple element (its ``should_save_*`` callable) is returned,
    in the default order.
    """
    # NOTE(review): this yields the ``should_save_*`` callables (element 1),
    # not the method names (element 0) -- confirm that is what callers expect.
    return [
        should_save
        for name, should_save, _save in get_default_archive_methods()
        if name not in to_ignore
    ]