Source code for archivebox.main

__package__ = 'archivebox'

import os
import sys
import shutil

from typing import Dict, List, Optional, Iterable, IO, Union
from crontab import CronTab, CronSlices

from .cli import (
    list_subcommands,
    run_subcommand,
    display_first,
    meta_cmds,
    main_cmds,
    archive_cmds,
)
from .parsers import (
    save_text_as_source,
    save_file_as_source,
    parse_links_memory,
)
from .index.schema import Link
from .util import enforce_types                         # type: ignore
from .system import get_dir_size, dedupe_cron_jobs, CRON_COMMENT
from .index import (
    load_main_index,
    parse_links_from_source,
    dedupe_links,
    write_main_index,
    link_matches_filter,
    get_indexed_folders,
    get_archived_folders,
    get_unarchived_folders,
    get_present_folders,
    get_valid_folders,
    get_invalid_folders,
    get_duplicate_folders,
    get_orphaned_folders,
    get_corrupted_folders,
    get_unrecognized_folders,
    fix_invalid_folder_locations,
)
from .index.json import (
    parse_json_main_index,
    parse_json_links_details,
)
from .index.sql import (
    parse_sql_main_index,
    get_admins,
    apply_migrations,
    remove_from_sql_main_index,
)
from .index.html import parse_html_main_index
from .extractors import archive_links, archive_link, ignore_methods
from .config import (
    stderr,
    hint,
    ConfigDict,
    ANSI,
    IS_TTY,
    IN_DOCKER,
    USER,
    ARCHIVEBOX_BINARY,
    ONLY_NEW,
    OUTPUT_DIR,
    SOURCES_DIR,
    ARCHIVE_DIR,
    LOGS_DIR,
    CONFIG_FILE,
    ARCHIVE_DIR_NAME,
    SOURCES_DIR_NAME,
    LOGS_DIR_NAME,
    STATIC_DIR_NAME,
    JSON_INDEX_FILENAME,
    HTML_INDEX_FILENAME,
    SQL_INDEX_FILENAME,
    ROBOTS_TXT_FILENAME,
    FAVICON_FILENAME,
    check_dependencies,
    check_data_folder,
    write_config_file,
    setup_django,
    VERSION,
    CODE_LOCATIONS,
    EXTERNAL_LOCATIONS,
    DATA_LOCATIONS,
    DEPENDENCIES,
    load_all_config,
    CONFIG,
    USER_CONFIG,
    get_real_name,
)
from .logging_util import (
    TERM_WIDTH,
    TimedProgress,
    log_importing_started,
    log_crawl_started,
    log_removal_started,
    log_removal_finished,
    log_list_started,
    log_list_finished,
    printable_config,
    printable_folders,
    printable_filesize,
    printable_folder_status,
    printable_dependency_version,
)


ALLOWED_IN_OUTPUT_DIR = {
    '.DS_Store',
    '.venv',
    'venv',
    'virtualenv',
    '.virtualenv',
    'node_modules',
    'package-lock.json',
    ARCHIVE_DIR_NAME,
    SOURCES_DIR_NAME,
    LOGS_DIR_NAME,
    STATIC_DIR_NAME,
    SQL_INDEX_FILENAME,
    JSON_INDEX_FILENAME,
    HTML_INDEX_FILENAME,
    ROBOTS_TXT_FILENAME,
    FAVICON_FILENAME,
}

@enforce_types
def help(out_dir: str=OUTPUT_DIR) -> None:
    """Print the ArchiveBox help message and usage"""

    all_subcommands = list_subcommands()
    COMMANDS_HELP_TEXT = '\n    '.join(
        f'{cmd.ljust(20)} {summary}'
        for cmd, summary in all_subcommands.items()
        if cmd in meta_cmds
    ) + '\n\n    ' + '\n    '.join(
        f'{cmd.ljust(20)} {summary}'
        for cmd, summary in all_subcommands.items()
        if cmd in main_cmds
    ) + '\n\n    ' + '\n    '.join(
        f'{cmd.ljust(20)} {summary}'
        for cmd, summary in all_subcommands.items()
        if cmd in archive_cmds
    ) + '\n\n    ' + '\n    '.join(
        f'{cmd.ljust(20)} {summary}'
        for cmd, summary in all_subcommands.items()
        if cmd not in display_first
    )

    if os.path.exists(os.path.join(out_dir, JSON_INDEX_FILENAME)):
        print('''{green}ArchiveBox v{}: The self-hosted internet archive.{reset}

{lightred}Active data directory:{reset}
    {}

{lightred}Usage:{reset}
    archivebox [command] [--help] [--version] [...args]

{lightred}Commands:{reset}
    {}

{lightred}Example Use:{reset}
    mkdir my-archive; cd my-archive/
    archivebox init
    archivebox status
    archivebox add https://example.com/some/page
    archivebox add --depth=1 ~/Downloads/bookmarks_export.html
    archivebox list --sort=timestamp --csv=timestamp,url,is_archived
    archivebox schedule --every=day https://example.com/some/feed.rss
    archivebox update --resume=15109948213.123

{lightred}Documentation:{reset}
    https://github.com/pirate/ArchiveBox/wiki
'''.format(VERSION, out_dir, COMMANDS_HELP_TEXT, **ANSI))
    else:
        print('{green}Welcome to ArchiveBox v{}!{reset}'.format(VERSION, **ANSI))
        print()
        if IN_DOCKER:
            print('When using Docker, you need to mount a volume to use as your data dir:')
            print('    docker run -v /some/path:/data archivebox ...')
            print()
        print('To import an existing archive (from a previous version of ArchiveBox):')
        print('    1. cd into your data dir OUTPUT_DIR (usually ArchiveBox/output) and run:')
        print('    2. archivebox init')
        print()
        print('To start a new archive:')
        print('    1. Create an empty directory, then cd into it and run:')
        print('    2. archivebox init')
        print()
        print('For more information, see the documentation here:')
        print('    https://github.com/pirate/ArchiveBox/wiki')

@enforce_types
def version(quiet: bool=False, out_dir: str=OUTPUT_DIR) -> None:
    """Print the ArchiveBox version and dependency information"""

    if quiet:
        print(VERSION)
    else:
        print('ArchiveBox v{}'.format(VERSION))
        print()

        print('{white}[i] Dependency versions:{reset}'.format(**ANSI))
        for name, dependency in DEPENDENCIES.items():
            print(printable_dependency_version(name, dependency))

        print()
        print('{white}[i] Code locations:{reset}'.format(**ANSI))
        for name, folder in CODE_LOCATIONS.items():
            print(printable_folder_status(name, folder))

        print()
        print('{white}[i] Secrets locations:{reset}'.format(**ANSI))
        for name, folder in EXTERNAL_LOCATIONS.items():
            print(printable_folder_status(name, folder))

        if DATA_LOCATIONS['OUTPUT_DIR']['is_valid']:
            print()
            print('{white}[i] Data locations:{reset}'.format(**ANSI))
            for name, folder in DATA_LOCATIONS.items():
                print(printable_folder_status(name, folder))

        print()
        check_dependencies()

@enforce_types
def run(subcommand: str,
        subcommand_args: Optional[List[str]],
        stdin: Optional[IO]=None,
        out_dir: str=OUTPUT_DIR) -> None:
    """Run a given ArchiveBox subcommand with the given list of args"""

    run_subcommand(
        subcommand=subcommand,
        subcommand_args=subcommand_args,
        stdin=stdin,
        pwd=out_dir,
    )

@enforce_types
def init(force: bool=False, out_dir: str=OUTPUT_DIR) -> None:
    """Initialize a new ArchiveBox collection in the current directory"""

    os.makedirs(out_dir, exist_ok=True)
    is_empty = not len(set(os.listdir(out_dir)) - ALLOWED_IN_OUTPUT_DIR)
    existing_index = os.path.exists(os.path.join(out_dir, JSON_INDEX_FILENAME))

    if is_empty and not existing_index:
        print('{green}[+] Initializing a new ArchiveBox collection in this folder...{reset}'.format(**ANSI))
        print(f'    {out_dir}')
        print('{green}------------------------------------------------------------------{reset}'.format(**ANSI))
    elif existing_index:
        print('{green}[*] Updating existing ArchiveBox collection in this folder...{reset}'.format(**ANSI))
        print(f'    {out_dir}')
        print('{green}------------------------------------------------------------------{reset}'.format(**ANSI))
    else:
        if force:
            stderr('[!] This folder appears to already have files in it, but no index.json is present.', color='lightyellow')
            stderr('    Because --force was passed, ArchiveBox will initialize anyway (which may overwrite existing files).')
        else:
            stderr(
                ("{red}[X] This folder appears to already have files in it, but no index.json is present.{reset}\n\n"
                 "    You must run init in a completely empty directory, or an existing data folder.\n\n"
                 "    {lightred}Hint:{reset} To import an existing data folder make sure to cd into the folder first, \n"
                 "    then run 'archivebox init' to pick up where you left off.\n\n"
                 "    (Always make sure your data folder is backed up first before updating ArchiveBox)"
                 ).format(out_dir, **ANSI)
            )
            raise SystemExit(2)

    if existing_index:
        print('\n{green}[*] Verifying archive folder structure...{reset}'.format(**ANSI))
    else:
        print('\n{green}[+] Building archive folder structure...{reset}'.format(**ANSI))

    os.makedirs(SOURCES_DIR, exist_ok=True)
    print(f'    √ {SOURCES_DIR}')

    os.makedirs(ARCHIVE_DIR, exist_ok=True)
    print(f'    √ {ARCHIVE_DIR}')

    os.makedirs(LOGS_DIR, exist_ok=True)
    print(f'    √ {LOGS_DIR}')

    write_config_file({}, out_dir=out_dir)
    print(f'    √ {CONFIG_FILE}')

    if os.path.exists(os.path.join(out_dir, SQL_INDEX_FILENAME)):
        print('\n{green}[*] Verifying main SQL index and running migrations...{reset}'.format(**ANSI))
    else:
        print('\n{green}[+] Building main SQL index and running migrations...{reset}'.format(**ANSI))

    setup_django(out_dir, check_db=False)
    DATABASE_FILE = os.path.join(out_dir, SQL_INDEX_FILENAME)
    print(f'    √ {DATABASE_FILE}')
    print()
    for migration_line in apply_migrations(out_dir):
        print(f'    {migration_line}')

    assert os.path.exists(DATABASE_FILE)

    # from django.contrib.auth.models import User
    # if IS_TTY and not User.objects.filter(is_superuser=True).exists():
    #     print('{green}[+] Creating admin user account...{reset}'.format(**ANSI))
    #     call_command("createsuperuser", interactive=True)

    print()
    print('{green}[*] Collecting links from any existing indexes and archive folders...{reset}'.format(**ANSI))

    all_links: Dict[str, Link] = {}
    if existing_index:
        all_links = {
            link.url: link
            for link in load_main_index(out_dir=out_dir, warn=False)
        }
        print('    √ Loaded {} links from existing main index.'.format(len(all_links)))

    # Links in data folders that dont match their timestamp
    fixed, cant_fix = fix_invalid_folder_locations(out_dir=out_dir)
    if fixed:
        print('    {lightyellow}√ Fixed {} data directory locations that didn\'t match their link timestamps.{reset}'.format(len(fixed), **ANSI))
    if cant_fix:
        print('    {lightyellow}! Could not fix {} data directory locations due to conflicts with existing folders.{reset}'.format(len(cant_fix), **ANSI))

    # Links in JSON index but not in main index
    orphaned_json_links = {
        link.url: link
        for link in parse_json_main_index(out_dir)
        if link.url not in all_links
    }
    if orphaned_json_links:
        all_links.update(orphaned_json_links)
        print('    {lightyellow}√ Added {} orphaned links from existing JSON index...{reset}'.format(len(orphaned_json_links), **ANSI))

    # Links in SQL index but not in main index
    orphaned_sql_links = {
        link.url: link
        for link in parse_sql_main_index(out_dir)
        if link.url not in all_links
    }
    if orphaned_sql_links:
        all_links.update(orphaned_sql_links)
        print('    {lightyellow}√ Added {} orphaned links from existing SQL index...{reset}'.format(len(orphaned_sql_links), **ANSI))

    # Links in data dir indexes but not in main index
    orphaned_data_dir_links = {
        link.url: link
        for link in parse_json_links_details(out_dir)
        if link.url not in all_links
    }
    if orphaned_data_dir_links:
        all_links.update(orphaned_data_dir_links)
        print('    {lightyellow}√ Added {} orphaned links from existing archive directories.{reset}'.format(len(orphaned_data_dir_links), **ANSI))

    # Links in invalid/duplicate data dirs
    invalid_folders = {
        folder: link
        for folder, link in get_invalid_folders(all_links.values(), out_dir=out_dir).items()
    }
    if invalid_folders:
        print('    {lightyellow}! Skipped adding {} invalid link data directories.{reset}'.format(len(invalid_folders), **ANSI))
        print('        X ' + '\n        X '.join(f'{folder} {link}' for folder, link in invalid_folders.items()))
        print()
        print('    {lightred}Hint:{reset} For more information about the link data directories that were skipped, run:'.format(**ANSI))
        print('        archivebox status')
        print('        archivebox list --status=invalid')

    write_main_index(list(all_links.values()), out_dir=out_dir)

    print('\n{green}------------------------------------------------------------------{reset}'.format(**ANSI))
    if existing_index:
        print('{green}[√] Done. Verified and updated the existing ArchiveBox collection.{reset}'.format(**ANSI))
    else:
        print('{green}[√] Done. A new ArchiveBox collection was initialized ({} links).{reset}'.format(len(all_links), **ANSI))
    print()
    print('    {lightred}Hint:{reset} To view your archive index, run:'.format(**ANSI))
    print('        archivebox server  # then visit http://127.0.0.1:8000')
    print()
    print('    To add new links, you can run:')
    print("        archivebox add ~/some/path/or/url/to/list_of_links.txt")
    print()
    print('    For more usage and examples, run:')
    print('        archivebox help')

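# Illustrative sketch (added for documentation, not part of upstream archivebox.main):
# init() and add() can be chained programmatically to bootstrap a collection.
# The './my-archive' path is a hypothetical example.
def _example_bootstrap_collection(out_dir: str='./my-archive') -> List[Link]:
    init(out_dir=out_dir)                                            # create folder structure + indexes
    return add('https://example.com/some/page', out_dir=out_dir)     # import and archive one URL
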
@enforce_types
def status(out_dir: str=OUTPUT_DIR) -> None:
    """Print out some info and statistics about the archive collection"""

    check_data_folder(out_dir=out_dir)

    from core.models import Snapshot
    from django.contrib.auth import get_user_model
    User = get_user_model()

    print('{green}[*] Scanning archive main index...{reset}'.format(**ANSI))
    print(ANSI['lightyellow'], f'   {out_dir}/*', ANSI['reset'])
    num_bytes, num_dirs, num_files = get_dir_size(out_dir, recursive=False, pattern='index.')
    size = printable_filesize(num_bytes)
    print(f'    Index size: {size} across {num_files} files')
    print()

    links = list(load_main_index(out_dir=out_dir))
    num_json_links = len(links)
    num_sql_links = sum(1 for link in parse_sql_main_index(out_dir=out_dir))
    num_html_links = sum(1 for url in parse_html_main_index(out_dir=out_dir))
    num_link_details = sum(1 for link in parse_json_links_details(out_dir=out_dir))
    print(f'    > JSON Main Index: {num_json_links} links'.ljust(36), f'(found in {JSON_INDEX_FILENAME})')
    print(f'    > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {SQL_INDEX_FILENAME})')
    print(f'    > HTML Main Index: {num_html_links} links'.ljust(36), f'(found in {HTML_INDEX_FILENAME})')
    print(f'    > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR_NAME}/*/index.json)')

    if num_html_links != len(links) or num_sql_links != len(links):
        print()
        print('    {lightred}Hint:{reset} You can fix index count differences automatically by running:'.format(**ANSI))
        print('        archivebox init')

    print()
    print('{green}[*] Scanning archive data directories...{reset}'.format(**ANSI))
    print(ANSI['lightyellow'], f'   {ARCHIVE_DIR}/*', ANSI['reset'])
    num_bytes, num_dirs, num_files = get_dir_size(ARCHIVE_DIR)
    size = printable_filesize(num_bytes)
    print(f'    Size: {size} across {num_files} files in {num_dirs} directories')
    print(ANSI['black'])

    num_indexed = len(get_indexed_folders(links, out_dir=out_dir))
    num_archived = len(get_archived_folders(links, out_dir=out_dir))
    num_unarchived = len(get_unarchived_folders(links, out_dir=out_dir))
    print(f'    > indexed: {num_indexed}'.ljust(36), f'({get_indexed_folders.__doc__})')
    print(f'    > archived: {num_archived}'.ljust(36), f'({get_archived_folders.__doc__})')
    print(f'    > unarchived: {num_unarchived}'.ljust(36), f'({get_unarchived_folders.__doc__})')

    num_present = len(get_present_folders(links, out_dir=out_dir))
    num_valid = len(get_valid_folders(links, out_dir=out_dir))
    print()
    print(f'    > present: {num_present}'.ljust(36), f'({get_present_folders.__doc__})')
    print(f'    > valid: {num_valid}'.ljust(36), f'({get_valid_folders.__doc__})')

    duplicate = get_duplicate_folders(links, out_dir=out_dir)
    orphaned = get_orphaned_folders(links, out_dir=out_dir)
    corrupted = get_corrupted_folders(links, out_dir=out_dir)
    unrecognized = get_unrecognized_folders(links, out_dir=out_dir)
    num_invalid = len({**duplicate, **orphaned, **corrupted, **unrecognized})
    print(f'    > invalid: {num_invalid}'.ljust(36), f'({get_invalid_folders.__doc__})')
    print(f'    > duplicate: {len(duplicate)}'.ljust(36), f'({get_duplicate_folders.__doc__})')
    print(f'    > orphaned: {len(orphaned)}'.ljust(36), f'({get_orphaned_folders.__doc__})')
    print(f'    > corrupted: {len(corrupted)}'.ljust(36), f'({get_corrupted_folders.__doc__})')
    print(f'    > unrecognized: {len(unrecognized)}'.ljust(36), f'({get_unrecognized_folders.__doc__})')

    print(ANSI['reset'])

    if num_indexed:
        print('    {lightred}Hint:{reset} You can list link data directories by status like so:'.format(**ANSI))
        print('        archivebox list --status=<status>  (e.g. indexed, corrupted, archived, etc.)')

    if orphaned:
        print('    {lightred}Hint:{reset} To automatically import orphaned data directories into the main index, run:'.format(**ANSI))
        print('        archivebox init')

    if num_invalid:
        print('    {lightred}Hint:{reset} You may need to manually remove or fix some invalid data directories, afterwards make sure to run:'.format(**ANSI))
        print('        archivebox init')

    print()
    print('{green}[*] Scanning recent archive changes and user logins:{reset}'.format(**ANSI))
    print(ANSI['lightyellow'], f'   {LOGS_DIR}/*', ANSI['reset'])
    users = get_admins().values_list('username', flat=True)
    print(f'    UI users {len(users)}: {", ".join(users)}')
    last_login = User.objects.order_by('last_login').last()
    if last_login:
        print(f'    Last UI login: {last_login.username} @ {str(last_login.last_login)[:16]}')
    last_updated = Snapshot.objects.order_by('updated').last()
    print(f'    Last changes: {str(last_updated.updated)[:16]}')

    if not users:
        print()
        print('    {lightred}Hint:{reset} You can create an admin user by running:'.format(**ANSI))
        print('        archivebox manage createsuperuser')

    print()
    for snapshot in Snapshot.objects.order_by('-updated')[:10]:
        if not snapshot.updated:
            continue
        print(
            ANSI['black'],
            (
                f'   > {str(snapshot.updated)[:16]} '
                f'[{snapshot.num_outputs} {("X", "√")[snapshot.is_archived]} {printable_filesize(snapshot.archive_size)}] '
                f'"{snapshot.title}": {snapshot.url}'
            )[:TERM_WIDTH()],
            ANSI['reset'],
        )
    print(ANSI['black'], '   ...', ANSI['reset'])

@enforce_types
def oneshot(url: str, out_dir: str=OUTPUT_DIR):
    """
    Create a single URL archive folder with an index.json and index.html, and all the archive method outputs.
    You can run this to archive single pages without needing to create a whole collection with archivebox init.
    """
    oneshot_link, _ = parse_links_memory([url])
    if len(oneshot_link) > 1:
        stderr(
            '[X] You should pass a single url to the oneshot command',
            color='red'
        )
        raise SystemExit(2)
    methods = ignore_methods(['title'])
    archive_link(oneshot_link[0], out_dir=out_dir, methods=methods, skip_index=True)
    return oneshot_link

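# Illustrative sketch (added for documentation, not part of upstream archivebox.main):
# oneshot() archives a single URL into a standalone folder without requiring an
# initialized collection. The '/tmp/one-page' output path is a hypothetical example.
def _example_oneshot_single_page() -> List[Link]:
    return oneshot('https://example.com/some/page', out_dir='/tmp/one-page')
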
@enforce_types
def add(urls: Union[str, List[str]],
        depth: int=0,
        update_all: bool=not ONLY_NEW,
        index_only: bool=False,
        overwrite: bool=False,
        init: bool=False,
        out_dir: str=OUTPUT_DIR) -> List[Link]:
    """Add a new URL or list of URLs to your archive"""

    assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'

    if init:
        run_subcommand('init', stdin=None, pwd=out_dir)

    # Load list of links from the existing index
    check_data_folder(out_dir=out_dir)
    check_dependencies()
    all_links: List[Link] = []
    new_links: List[Link] = []
    all_links = load_main_index(out_dir=out_dir)

    log_importing_started(urls=urls, depth=depth, index_only=index_only)
    if isinstance(urls, str):
        # save verbatim stdin to sources
        write_ahead_log = save_text_as_source(urls, filename='{ts}-import.txt', out_dir=out_dir)
    elif isinstance(urls, list):
        # save verbatim args to sources
        write_ahead_log = save_text_as_source('\n'.join(urls), filename='{ts}-import.txt', out_dir=out_dir)

    new_links += parse_links_from_source(write_ahead_log, root_url=None)

    # If we're going one level deeper, download each link and look for more links
    new_links_depth = []
    if new_links and depth == 1:
        log_crawl_started(new_links)
        for new_link in new_links:
            downloaded_file = save_file_as_source(new_link.url, filename=f'{new_link.timestamp}-crawl-{new_link.domain}.txt', out_dir=out_dir)
            new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)

    imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values())
    all_links, new_links = dedupe_links(all_links, imported_links)
    write_main_index(links=all_links, out_dir=out_dir, finished=not new_links)

    if index_only:
        return all_links

    # Run the archive methods for each link
    if update_all:
        archive_links(all_links, overwrite=overwrite, out_dir=out_dir)
    elif overwrite:
        archive_links(imported_links, overwrite=True, out_dir=out_dir)
    elif new_links:
        archive_links(new_links, overwrite=False, out_dir=out_dir)
    else:
        # nothing was updated, don't bother re-saving the index
        return all_links

    # Step 4: Re-write links index with updated titles, icons, and resources
    all_links = load_main_index(out_dir=out_dir)
    write_main_index(links=list(all_links), out_dir=out_dir, finished=True)
    return all_links

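# Illustrative sketch (added for documentation, not part of upstream archivebox.main):
# add() with depth=1 also crawls each imported page for further outbound links,
# mirroring `archivebox add --depth=1`. The out_dir value is a hypothetical example.
def _example_add_with_crawling(out_dir: str='./my-archive') -> List[Link]:
    return add('https://example.com/blog/index.html', depth=1, out_dir=out_dir)
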
@enforce_types
def remove(filter_str: Optional[str]=None,
           filter_patterns: Optional[List[str]]=None,
           filter_type: str='exact',
           links: Optional[List[Link]]=None,
           after: Optional[float]=None,
           before: Optional[float]=None,
           yes: bool=False,
           delete: bool=False,
           out_dir: str=OUTPUT_DIR) -> List[Link]:
    """Remove the specified URLs from the archive"""

    check_data_folder(out_dir=out_dir)

    if links is None:
        if filter_str and filter_patterns:
            stderr(
                '[X] You should pass either a pattern as an argument, '
                'or pass a list of patterns via stdin, but not both.\n',
                color='red',
            )
            raise SystemExit(2)
        elif not (filter_str or filter_patterns):
            stderr(
                '[X] You should pass either a pattern as an argument, '
                'or pass a list of patterns via stdin.',
                color='red',
            )
            stderr()
            hint(('To remove all urls you can run:',
                  'archivebox remove --filter-type=regex ".*"'))
            stderr()
            raise SystemExit(2)
        elif filter_str:
            filter_patterns = [ptn.strip() for ptn in filter_str.split('\n')]

        log_list_started(filter_patterns, filter_type)
        timer = TimedProgress(360, prefix='      ')
        try:
            links = list(list_links(
                filter_patterns=filter_patterns,
                filter_type=filter_type,
                after=after,
                before=before,
            ))
        finally:
            timer.end()

    if not len(links):
        log_removal_finished(0, 0)
        raise SystemExit(1)

    log_list_finished(links)
    log_removal_started(links, yes=yes, delete=delete)

    timer = TimedProgress(360, prefix='      ')
    try:
        to_keep = []
        to_delete = []
        all_links = load_main_index(out_dir=out_dir)
        for link in all_links:
            should_remove = (
                (after is not None and float(link.timestamp) < after)
                or (before is not None and float(link.timestamp) > before)
                or link_matches_filter(link, filter_patterns or [], filter_type)
                or link in links
            )
            if should_remove:
                to_delete.append(link)
                if delete:
                    shutil.rmtree(link.link_dir, ignore_errors=True)
            else:
                to_keep.append(link)
    finally:
        timer.end()

    remove_from_sql_main_index(links=to_delete, out_dir=out_dir)
    write_main_index(links=to_keep, out_dir=out_dir, finished=True)
    log_removal_finished(len(all_links), len(to_keep))

    return to_keep

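# Illustrative sketch (added for documentation, not part of upstream archivebox.main):
# remove() with filter_type='regex' drops every matching entry from the index;
# delete=True also removes the snapshot folders from disk. The domain is an example.
def _example_remove_domain(out_dir: str=OUTPUT_DIR) -> List[Link]:
    return remove(filter_str=r'^https?://(www\.)?example\.com/', filter_type='regex',
                  yes=True, delete=True, out_dir=out_dir)
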
@enforce_types
def update(resume: Optional[float]=None,
           only_new: bool=ONLY_NEW,
           index_only: bool=False,
           overwrite: bool=False,
           filter_patterns_str: Optional[str]=None,
           filter_patterns: Optional[List[str]]=None,
           filter_type: Optional[str]=None,
           status: Optional[str]=None,
           after: Optional[str]=None,
           before: Optional[str]=None,
           out_dir: str=OUTPUT_DIR) -> List[Link]:
    """Import any new links from subscriptions and retry any previously failed/skipped links"""

    check_data_folder(out_dir=out_dir)
    check_dependencies()

    # Step 1: Load list of links from the existing index
    #         merge in and dedupe new links from import_path
    all_links: List[Link] = []
    new_links: List[Link] = []
    all_links = load_main_index(out_dir=out_dir)

    # Step 2: Write updated index with deduped old and new links back to disk
    write_main_index(links=list(all_links), out_dir=out_dir)

    # Step 3: Filter for selected_links
    matching_links = list_links(
        filter_patterns=filter_patterns,
        filter_type=filter_type,
        before=before,
        after=after,
    )
    matching_folders = list_folders(
        links=list(matching_links),
        status=status,
        out_dir=out_dir,
    )
    all_links = [link for link in matching_folders.values() if link]

    if index_only:
        return all_links

    # Step 3: Run the archive methods for each link
    to_archive = new_links if only_new else all_links
    if resume:
        to_archive = [
            link for link in to_archive
            if link.timestamp >= str(resume)
        ]
        if not to_archive:
            stderr('')
            stderr(f'[√] Nothing found to resume after {resume}', color='green')
            return all_links

    archive_links(to_archive, overwrite=overwrite, out_dir=out_dir)

    # Step 4: Re-write links index with updated titles, icons, and resources
    all_links = load_main_index(out_dir=out_dir)
    write_main_index(links=list(all_links), out_dir=out_dir, finished=True)
    return all_links

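# Illustrative sketch (added for documentation, not part of upstream archivebox.main):
# update(resume=...) retries archiving starting from a given snapshot timestamp, the
# same as `archivebox update --resume=<timestamp>`. The timestamp below is made up.
def _example_resume_update(out_dir: str=OUTPUT_DIR) -> List[Link]:
    return update(resume=15109948213.123, only_new=False, out_dir=out_dir)
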
@enforce_types
def list_all(filter_patterns_str: Optional[str]=None,
             filter_patterns: Optional[List[str]]=None,
             filter_type: str='exact',
             status: Optional[str]=None,
             after: Optional[float]=None,
             before: Optional[float]=None,
             sort: Optional[str]=None,
             csv: Optional[str]=None,
             json: bool=False,
             out_dir: str=OUTPUT_DIR) -> Iterable[Link]:
    """List, filter, and export information about archive entries"""

    check_data_folder(out_dir=out_dir)

    if filter_patterns and filter_patterns_str:
        stderr(
            '[X] You should either pass filter patterns as arguments '
            'or via stdin, but not both.\n',
            color='red',
        )
        raise SystemExit(2)
    elif filter_patterns_str:
        filter_patterns = filter_patterns_str.split('\n')

    links = list_links(
        filter_patterns=filter_patterns,
        filter_type=filter_type,
        before=before,
        after=after,
    )

    if sort:
        links = sorted(links, key=lambda link: getattr(link, sort))

    folders = list_folders(
        links=list(links),
        status=status,
        out_dir=out_dir,
    )

    print(printable_folders(folders, json=json, csv=csv))
    return folders

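# Illustrative sketch (added for documentation, not part of upstream archivebox.main):
# list_all() both prints and returns matching entries; csv= selects the columns,
# mirroring `archivebox list --sort=timestamp --csv=timestamp,url,is_archived`.
def _example_export_archived_csv(out_dir: str=OUTPUT_DIR) -> Iterable[Link]:
    return list_all(status='archived', sort='timestamp',
                    csv='timestamp,url,is_archived', out_dir=out_dir)
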
@enforce_types
def list_folders(links: List[Link],
                 status: str,
                 out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:

    check_data_folder(out_dir=out_dir)

    if status == 'indexed':
        return get_indexed_folders(links, out_dir=out_dir)
    elif status == 'archived':
        return get_archived_folders(links, out_dir=out_dir)
    elif status == 'unarchived':
        return get_unarchived_folders(links, out_dir=out_dir)

    elif status == 'present':
        return get_present_folders(links, out_dir=out_dir)
    elif status == 'valid':
        return get_valid_folders(links, out_dir=out_dir)
    elif status == 'invalid':
        return get_invalid_folders(links, out_dir=out_dir)

    elif status == 'duplicate':
        return get_duplicate_folders(links, out_dir=out_dir)
    elif status == 'orphaned':
        return get_orphaned_folders(links, out_dir=out_dir)
    elif status == 'corrupted':
        return get_corrupted_folders(links, out_dir=out_dir)
    elif status == 'unrecognized':
        return get_unrecognized_folders(links, out_dir=out_dir)

    raise ValueError('Status not recognized.')

@enforce_types
def config(config_options_str: Optional[str]=None,
           config_options: Optional[List[str]]=None,
           get: bool=False,
           set: bool=False,
           reset: bool=False,
           out_dir: str=OUTPUT_DIR) -> None:
    """Get and set your ArchiveBox project configuration values"""

    check_data_folder(out_dir=out_dir)

    if config_options and config_options_str:
        stderr(
            '[X] You should either pass config values as arguments '
            'or via stdin, but not both.\n',
            color='red',
        )
        raise SystemExit(2)
    elif config_options_str:
        config_options = config_options_str.split('\n')

    config_options = config_options or []

    no_args = not (get or set or reset or config_options)

    matching_config: ConfigDict = {}
    if get or no_args:
        if config_options:
            config_options = [get_real_name(key) for key in config_options]
            matching_config = {key: CONFIG[key] for key in config_options if key in CONFIG}
            failed_config = [key for key in config_options if key not in CONFIG]
            if failed_config:
                stderr()
                stderr('[X] These options failed to get', color='red')
                stderr('    {}'.format('\n    '.join(config_options)))
                raise SystemExit(1)
        else:
            matching_config = CONFIG

        print(printable_config(matching_config))
        raise SystemExit(not matching_config)
    elif set:
        new_config = {}
        failed_options = []
        for line in config_options:
            if line.startswith('#') or not line.strip():
                continue
            if '=' not in line:
                stderr('[X] Config KEY=VALUE must have an = sign in it', color='red')
                stderr(f'    {line}')
                raise SystemExit(2)

            raw_key, val = line.split('=')
            raw_key = raw_key.upper().strip()
            key = get_real_name(raw_key)
            if key != raw_key:
                stderr(f'[i] Note: The config option {raw_key} has been renamed to {key}, please use the new name going forwards.', color='lightyellow')

            if key in CONFIG:
                new_config[key] = val.strip()
            else:
                failed_options.append(line)

        if new_config:
            before = CONFIG
            matching_config = write_config_file(new_config, out_dir=OUTPUT_DIR)
            after = load_all_config()
            print(printable_config(matching_config))

            side_effect_changes: ConfigDict = {}
            for key, val in after.items():
                if key in USER_CONFIG and (before[key] != after[key]) and (key not in matching_config):
                    side_effect_changes[key] = after[key]

            if side_effect_changes:
                stderr()
                stderr('[i] Note: This change also affected these other options that depended on it:', color='lightyellow')
                print('    {}'.format(printable_config(side_effect_changes, prefix='    ')))

        if failed_options:
            stderr()
            stderr('[X] These options failed to set (check for typos):', color='red')
            stderr('    {}'.format('\n    '.join(failed_options)))

        raise SystemExit(bool(failed_options))
    elif reset:
        stderr('[X] This command is not implemented yet.', color='red')
        stderr('    Please manually remove the relevant lines from your config file:')
        stderr(f'        {CONFIG_FILE}')
        raise SystemExit(2)
    else:
        stderr('[X] You must pass either --get or --set, or no arguments to get the whole config.', color='red')
        stderr('        archivebox config')
        stderr('        archivebox config --get SOME_KEY')
        stderr('        archivebox config --set SOME_KEY=SOME_VALUE')
        raise SystemExit(2)

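# Illustrative sketch (added for documentation, not part of upstream archivebox.main):
# config() reads or writes archivebox.conf; note that it always exits via SystemExit
# after printing, so it is intended for CLI use rather than as a query API.
def _example_set_config_value(out_dir: str=OUTPUT_DIR) -> None:
    config(config_options=['ONLY_NEW=True'], set=True, out_dir=out_dir)
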
@enforce_types
def schedule(add: bool=False,
             show: bool=False,
             clear: bool=False,
             foreground: bool=False,
             run_all: bool=False,
             quiet: bool=False,
             every: Optional[str]=None,
             depth: int=0,
             import_path: Optional[str]=None,
             out_dir: str=OUTPUT_DIR):
    """Set ArchiveBox to regularly import URLs at specific times using cron"""

    check_data_folder(out_dir=out_dir)

    os.makedirs(os.path.join(out_dir, LOGS_DIR_NAME), exist_ok=True)

    cron = CronTab(user=True)
    cron = dedupe_cron_jobs(cron)

    if clear:
        print(cron.remove_all(comment=CRON_COMMENT))
        cron.write()
        raise SystemExit(0)

    existing_jobs = list(cron.find_comment(CRON_COMMENT))

    if every or add:
        every = every or 'day'
        quoted = lambda s: f'"{s}"' if s and ' ' in s else s
        cmd = [
            'cd',
            quoted(out_dir),
            '&&',
            quoted(ARCHIVEBOX_BINARY),
            *(['add', f'--depth={depth}', f'"{import_path}"'] if import_path else ['update']),
            '>',
            quoted(os.path.join(LOGS_DIR, 'archivebox.log')),
            '2>&1',
        ]
        new_job = cron.new(command=' '.join(cmd), comment=CRON_COMMENT)

        if every in ('minute', 'hour', 'day', 'month', 'year'):
            set_every = getattr(new_job.every(), every)
            set_every()
        elif CronSlices.is_valid(every):
            new_job.setall(every)
        else:
            stderr('{red}[X] Got invalid timeperiod for cron task.{reset}'.format(**ANSI))
            stderr('    It must be one of minute/hour/day/month/year')
            stderr('    or a quoted cron-format schedule like:')
            stderr('        archivebox schedule --every=day https://example.com/some/rss/feed.xml')
            stderr('        archivebox schedule --every="0/5 * * * *" https://example.com/some/rss/feed.xml')
            raise SystemExit(1)

        cron = dedupe_cron_jobs(cron)
        cron.write()

        total_runs = sum(j.frequency_per_year() for j in cron)
        existing_jobs = list(cron.find_comment(CRON_COMMENT))

        print()
        print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(USER, len(existing_jobs), **ANSI))
        print('\n'.join(f'   > {cmd}' if str(cmd) == str(new_job) else f'     {cmd}' for cmd in existing_jobs))
        if total_runs > 60 and not quiet:
            stderr()
            stderr('{lightyellow}[!] With the current cron config, ArchiveBox is estimated to run >{} times per year.{reset}'.format(total_runs, **ANSI))
            stderr('    Congrats on being an enthusiastic internet archiver! πŸ‘Œ')
            stderr()
            stderr('    Make sure you have enough storage space available to hold all the data.')
            stderr('    Using a compressed/deduped filesystem like ZFS is recommended if you plan on archiving a lot.')
            stderr('')
    elif show:
        if existing_jobs:
            print('\n'.join(str(cmd) for cmd in existing_jobs))
        else:
            stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(USER, **ANSI))
            stderr('    To schedule a new job, run:')
            stderr('        archivebox schedule --every=[timeperiod] https://example.com/some/rss/feed.xml')
        raise SystemExit(0)

    cron = CronTab(user=True)
    cron = dedupe_cron_jobs(cron)
    existing_jobs = list(cron.find_comment(CRON_COMMENT))

    if foreground or run_all:
        if not existing_jobs:
            stderr('{red}[X] You must schedule some jobs first before running in foreground mode.{reset}'.format(**ANSI))
            stderr('    archivebox schedule --every=hour https://example.com/some/rss/feed.xml')
            raise SystemExit(1)
        print('{green}[*] Running {} ArchiveBox jobs in foreground task scheduler...{reset}'.format(len(existing_jobs), **ANSI))
        if run_all:
            try:
                for job in existing_jobs:
                    sys.stdout.write(f'  > {job.command.split("/archivebox ")[0].split(" && ")[0]}\n')
                    sys.stdout.write(f'    > {job.command.split("/archivebox ")[-1].split(" > ")[0]}')
                    sys.stdout.flush()
                    job.run()
                    sys.stdout.write(f'\r    √ {job.command.split("/archivebox ")[-1]}\n')
            except KeyboardInterrupt:
                print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
                raise SystemExit(1)
        if foreground:
            try:
                for job in existing_jobs:
                    print(f'  > {job.command.split("/archivebox ")[-1].split(" > ")[0]}')
                for result in cron.run_scheduler():
                    print(result)
            except KeyboardInterrupt:
                print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
                raise SystemExit(1)

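# Illustrative sketch (added for documentation, not part of upstream archivebox.main):
# schedule(every=..., import_path=...) installs a cron job (deduped via CRON_COMMENT)
# that re-imports the given feed, like `archivebox schedule --every=day <feed-url>`.
def _example_schedule_daily_feed(out_dir: str=OUTPUT_DIR) -> None:
    schedule(add=True, every='day',
             import_path='https://example.com/some/rss/feed.xml', out_dir=out_dir)
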
@enforce_types
def server(runserver_args: Optional[List[str]]=None,
           reload: bool=False,
           debug: bool=False,
           init: bool=False,
           out_dir: str=OUTPUT_DIR) -> None:
    """Run the ArchiveBox HTTP server"""

    runserver_args = runserver_args or []

    if init:
        run_subcommand('init', stdin=None, pwd=out_dir)

    # setup config for django runserver
    from . import config
    config.SHOW_PROGRESS = False
    config.DEBUG = config.DEBUG or debug

    check_data_folder(out_dir=out_dir)
    setup_django(out_dir)

    from django.core.management import call_command
    from django.contrib.auth.models import User

    admin_user = User.objects.filter(is_superuser=True).order_by('date_joined').only('username').last()

    print('{green}[+] Starting ArchiveBox webserver...{reset}'.format(**ANSI))
    if admin_user:
        hint('The admin username is{lightblue} {}{reset}\n'.format(admin_user.username, **ANSI))
    else:
        print('{lightyellow}[!] No admin users exist yet, you will not be able to edit links in the UI.{reset}'.format(**ANSI))
        print()
        print('    To create an admin user, run:')
        print('        archivebox manage createsuperuser')
        print()

    # fallback to serving staticfiles insecurely with django when DEBUG=False
    if not config.DEBUG:
        runserver_args.append('--insecure')  # TODO: serve statics w/ nginx instead

    # toggle autoreloading when archivebox code changes (it's on by default)
    if not reload:
        runserver_args.append('--noreload')

    config.SHOW_PROGRESS = False
    config.DEBUG = config.DEBUG or debug

    call_command("runserver", *runserver_args)

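# Illustrative sketch (added for documentation, not part of upstream archivebox.main):
# server(init=True) initializes the collection (if needed) before starting Django's
# runserver; the address below is just an example of a runserver-style argument.
def _example_run_webserver(out_dir: str=OUTPUT_DIR) -> None:
    server(runserver_args=['127.0.0.1:8000'], reload=False, out_dir=out_dir)
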
@enforce_types
def manage(args: Optional[List[str]]=None, out_dir: str=OUTPUT_DIR) -> None:
    """Run an ArchiveBox Django management command"""

    check_data_folder(out_dir=out_dir)
    setup_django(out_dir)
    from django.core.management import execute_from_command_line

    if (args and "createsuperuser" in args) and (IN_DOCKER and not IS_TTY):
        stderr('[!] Warning: you need to pass -it to use interactive commands in docker', color='lightyellow')
        stderr('    docker run -it archivebox manage {}'.format(' '.join(args or ['...'])), color='lightyellow')
        stderr()

    execute_from_command_line([f'{ARCHIVEBOX_BINARY} manage', *(args or ['help'])])

@enforce_types
def shell(out_dir: str=OUTPUT_DIR) -> None:
    """Enter an interactive ArchiveBox Django shell"""

    check_data_folder(out_dir=out_dir)

    setup_django(OUTPUT_DIR)
    from django.core.management import call_command
    call_command("shell_plus")