Source code for archivebox.index

__package__ = 'archivebox.index'

import re
import os
import shutil
import json as pyjson

from itertools import chain
from typing import List, Tuple, Dict, Optional, Iterable
from collections import OrderedDict
from contextlib import contextmanager
from urllib.parse import urlparse

from ..system import atomic_write
from ..util import (
    scheme,
    enforce_types,
    ExtendedEncoder,
)
from ..config import (
    ARCHIVE_DIR_NAME,
    SQL_INDEX_FILENAME,
    JSON_INDEX_FILENAME,
    HTML_INDEX_FILENAME,
    OUTPUT_DIR,
    TIMEOUT,
    URL_BLACKLIST_PTN,
    ANSI,
    stderr,
    OUTPUT_PERMISSIONS
)
from ..logging_util import (
    TimedProgress,
    log_indexing_process_started,
    log_indexing_process_finished,
    log_indexing_started,
    log_indexing_finished,
    log_parsing_finished,
    log_deduping_finished,
)

from .schema import Link, ArchiveResult
from .html import (
    write_html_main_index,
    write_html_link_details,
)
from .json import (
    parse_json_main_index,
    write_json_main_index,
    parse_json_link_details, 
    write_json_link_details,
)
from .sql import (
    write_sql_main_index,
    parse_sql_main_index,
    write_sql_link_details,
)

### Link filtering and checking


@enforce_types
def lowest_uniq_timestamp(used_timestamps: OrderedDict, timestamp: str) -> str:
    """resolve duplicate timestamps by appending a decimal 1234, 1234 -> 1234.1, 1234.2"""

    timestamp = timestamp.split('.')[0]
    nonce = 0

    # first try 152323423 before 152323423.0
    if timestamp not in used_timestamps:
        return timestamp

    new_timestamp = '{}.{}'.format(timestamp, nonce)
    while new_timestamp in used_timestamps:
        nonce += 1
        new_timestamp = '{}.{}'.format(timestamp, nonce)

    return new_timestamp

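# Example (illustrative sketch, not part of the original module): a timestamp that
# already appears in the index gets the first free `.N` suffix, counting up from 0.
#
#   >>> used = OrderedDict({'152323423': None, '152323423.0': None})
#   >>> lowest_uniq_timestamp(used, '152323423')
#   '152323423.1'
#   >>> lowest_uniq_timestamp(used, '999999999')
#   '999999999'
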
### Main Links Index
@contextmanager
@enforce_types
def timed_index_update(out_path: str):
    log_indexing_started(out_path)
    timer = TimedProgress(TIMEOUT * 2, prefix=' ')
    try:
        yield
    finally:
        timer.end()

    assert os.path.exists(out_path), f'Failed to write index file: {out_path}'
    log_indexing_finished(out_path)

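# Usage sketch (illustrative only): the context manager above wraps a single index
# write so that start/finish logging and a progress timer surround it, and the
# output file is asserted to exist afterwards, exactly as done in write_main_index()
# below, e.g.:
#
#   with timed_index_update(os.path.join(out_dir, JSON_INDEX_FILENAME)):
#       write_json_main_index(links, out_dir=out_dir)
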
@enforce_types
def write_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None:
    """create index.html file for a given list of links"""

    log_indexing_process_started(len(links))

    try:
        with timed_index_update(os.path.join(out_dir, SQL_INDEX_FILENAME)):
            write_sql_main_index(links, out_dir=out_dir)
            os.chmod(os.path.join(out_dir, SQL_INDEX_FILENAME), int(OUTPUT_PERMISSIONS, base=8))  # set here because we don't write it with atomic writes

        with timed_index_update(os.path.join(out_dir, JSON_INDEX_FILENAME)):
            write_json_main_index(links, out_dir=out_dir)

        with timed_index_update(os.path.join(out_dir, HTML_INDEX_FILENAME)):
            write_html_main_index(links, out_dir=out_dir, finished=finished)
    except (KeyboardInterrupt, SystemExit):
        stderr('[!] Warning: Still writing index to disk...', color='lightyellow')
        stderr('    Run archivebox init to fix any inconsistencies from an ungraceful exit.')
        with timed_index_update(os.path.join(out_dir, SQL_INDEX_FILENAME)):
            write_sql_main_index(links, out_dir=out_dir)
            os.chmod(os.path.join(out_dir, SQL_INDEX_FILENAME), int(OUTPUT_PERMISSIONS, base=8))  # set here because we don't write it with atomic writes
        raise SystemExit(0)

    log_indexing_process_finished()

@enforce_types
def load_main_index(out_dir: str=OUTPUT_DIR, warn: bool=True) -> List[Link]:
    """parse and load existing index with any new links from import_path merged in"""

    all_links: List[Link] = []
    try:
        all_links = list(parse_json_main_index(out_dir))
        links_from_sql = list(parse_sql_main_index(out_dir))

        json_urls = set(l.url for l in all_links)
        sql_urls = set(l.url for l in links_from_sql)
        only_in_sql = sql_urls - json_urls
        only_in_json = json_urls - sql_urls

        if only_in_json or only_in_sql:
            stderr('{red}[!] Warning: SQL index does not match JSON index!{reset}'.format(**ANSI))
            if only_in_json:
                stderr('    > Only in JSON: {}...'.format(', '.join(list(only_in_json)[:5])))
            if only_in_sql:
                stderr('    > Only in SQL: {}...'.format(', '.join(list(only_in_sql)[:5])))
            stderr('    To repair the index and re-import any orphaned links run:')
            stderr('        archivebox init')
        if only_in_sql:
            # meh, this is harmless, it'll get overwritten on next run anyway
            pass
    except (KeyboardInterrupt, SystemExit):
        raise SystemExit(0)

    return all_links

@enforce_types
def load_main_index_meta(out_dir: str=OUTPUT_DIR) -> Optional[dict]:
    index_path = os.path.join(out_dir, JSON_INDEX_FILENAME)
    if os.path.exists(index_path):
        with open(index_path, 'r', encoding='utf-8') as f:
            meta_dict = pyjson.load(f)
            meta_dict.pop('links')
            return meta_dict

    return None

@enforce_types
def patch_main_index(link: Link, out_dir: str=OUTPUT_DIR) -> None:
    """hack to in-place update one row's info in the generated index files"""

    # TODO: remove this ASAP, it's ugly, error-prone, and potentially dangerous

    title = link.title or link.latest_outputs(status='succeeded')['title']
    successful = link.num_outputs

    # Patch JSON main index
    json_file_links = parse_json_main_index(out_dir)
    patched_links = []
    for saved_link in json_file_links:
        if saved_link.url == link.url:
            patched_links.append(saved_link.overwrite(
                title=title,
                history=link.history,
                updated=link.updated,
            ))
        else:
            patched_links.append(saved_link)

    write_json_main_index(patched_links, out_dir=out_dir)

    # Patch HTML main index
    html_path = os.path.join(out_dir, 'index.html')
    with open(html_path, 'r') as f:
        html = f.read().splitlines()

    for idx, line in enumerate(html):
        if title and ('<span data-title-for="{}"'.format(link.url) in line):
            html[idx] = '<span>{}</span>'.format(title)
        elif successful and ('<span data-number-for="{}"'.format(link.url) in line):
            html[idx] = '<span>{}</span>'.format(successful)
            break

    atomic_write(html_path, '\n'.join(html))

### Link Details Index

LINK_FILTERS = {
    'exact': lambda link, pattern: (link.url == pattern) or (link.base_url == pattern),
    'substring': lambda link, pattern: pattern in link.url,
    'regex': lambda link, pattern: bool(re.match(pattern, link.url)),
    'domain': lambda link, pattern: link.domain == pattern,
}

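# Example (illustrative sketch, not part of the original module): a hypothetical
# helper showing how the LINK_FILTERS lambdas above could be applied to a set of
# links, where filter_type selects the matching strategy ('exact', 'substring',
# 'regex', or 'domain').
#
#   def snapshots_matching(links: Iterable[Link], patterns: List[str], filter_type: str='exact') -> Iterable[Link]:
#       for link in links:
#           if any(LINK_FILTERS[filter_type](link, pattern) for pattern in patterns):
#               yield link
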
def get_indexed_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
    """indexed links without checking archive status or data directory validity"""
    return {
        link.link_dir: link
        for link in links
    }

def get_archived_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
    """indexed links that are archived with a valid data directory"""
    return {
        link.link_dir: link
        for link in filter(is_archived, links)
    }

def get_unarchived_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
    """indexed links that are unarchived with no data directory or an empty data directory"""
    return {
        link.link_dir: link
        for link in filter(is_unarchived, links)
    }

def get_present_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
    """dirs that actually exist in the archive/ folder"""

    all_folders = {}

    for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
        if entry.is_dir(follow_symlinks=True):
            link = None
            try:
                link = parse_json_link_details(entry.path)
            except Exception:
                pass

            all_folders[entry.path] = link

    return all_folders

def get_valid_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
    """dirs with a valid index matched to the main index and archived content"""
    return {
        link.link_dir: link
        for link in filter(is_valid, links)
    }

def get_invalid_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
    """dirs that are invalid for any reason: corrupted/duplicate/orphaned/unrecognized"""
    duplicate = get_duplicate_folders(links, out_dir=out_dir)
    orphaned = get_orphaned_folders(links, out_dir=out_dir)
    corrupted = get_corrupted_folders(links, out_dir=out_dir)
    unrecognized = get_unrecognized_folders(links, out_dir=out_dir)
    return {**duplicate, **orphaned, **corrupted, **unrecognized}

def get_duplicate_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
    """dirs that conflict with other directories that have the same link URL or timestamp"""
    links = list(links)
    by_url = {link.url: 0 for link in links}
    by_timestamp = {link.timestamp: 0 for link in links}

    duplicate_folders = {}

    indexed_folders = {link.link_dir for link in links}
    data_folders = (
        entry.path
        for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME))
        if entry.is_dir(follow_symlinks=True) and entry.path not in indexed_folders
    )

    for path in chain(sorted(indexed_folders), sorted(data_folders)):
        link = None
        try:
            link = parse_json_link_details(path)
        except Exception:
            pass

        if link:
            # link folder has same timestamp as different link folder
            by_timestamp[link.timestamp] = by_timestamp.get(link.timestamp, 0) + 1
            if by_timestamp[link.timestamp] > 1:
                duplicate_folders[path] = link

            # link folder has same url as different link folder
            by_url[link.url] = by_url.get(link.url, 0) + 1
            if by_url[link.url] > 1:
                duplicate_folders[path] = link

    return duplicate_folders

def get_orphaned_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
    """dirs that contain a valid index but aren't listed in the main index"""
    links = list(links)
    indexed_folders = {link.link_dir: link for link in links}
    orphaned_folders = {}

    for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
        if entry.is_dir(follow_symlinks=True):
            link = None
            try:
                link = parse_json_link_details(entry.path)
            except Exception:
                pass

            if link and entry.path not in indexed_folders:
                # folder is a valid link data dir with index details, but it's not in the main index
                orphaned_folders[entry.path] = link

    return orphaned_folders

def get_corrupted_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
    """dirs that don't contain a valid index and aren't listed in the main index"""
    return {
        link.link_dir: link
        for link in filter(is_corrupt, links)
    }

def get_unrecognized_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
    """dirs that don't contain recognizable archive data and aren't listed in the main index"""
    by_timestamp = {link.timestamp: 0 for link in links}
    unrecognized_folders: Dict[str, Optional[Link]] = {}

    for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
        if entry.is_dir(follow_symlinks=True):
            index_exists = os.path.exists(os.path.join(entry.path, 'index.json'))
            link = None
            try:
                link = parse_json_link_details(entry.path)
            except KeyError:
                # Try to fix index
                if index_exists:
                    try:
                        # Last attempt to repair the detail index
                        link_guessed = parse_json_link_details(entry.path, guess=True)
                        write_json_link_details(link_guessed, out_dir=entry.path)
                        link = parse_json_link_details(entry.path)
                    except Exception:
                        pass

            if index_exists and link is None:
                # index exists but it's corrupted or unparseable
                unrecognized_folders[entry.path] = link

            elif not index_exists:
                # link details index doesn't exist and the folder isn't in the main index
                timestamp = entry.path.rsplit('/', 1)[-1]
                if timestamp not in by_timestamp:
                    unrecognized_folders[entry.path] = link

    return unrecognized_folders

def is_valid(link: Link) -> bool:
    dir_exists = os.path.exists(link.link_dir)
    index_exists = os.path.exists(os.path.join(link.link_dir, 'index.json'))
    if not dir_exists:
        # unarchived links are not included in the valid list
        return False
    if dir_exists and not index_exists:
        return False
    if dir_exists and index_exists:
        try:
            parsed_link = parse_json_link_details(link.link_dir, guess=True)
            return link.url == parsed_link.url
        except Exception:
            pass
    return False

def is_corrupt(link: Link) -> bool:
    if not os.path.exists(link.link_dir):
        # unarchived links are not considered corrupt
        return False

    if is_valid(link):
        return False

    return True

def is_archived(link: Link) -> bool:
    return is_valid(link) and link.is_archived

def is_unarchived(link: Link) -> bool:
    if not os.path.exists(link.link_dir):
        return True
    return not link.is_archived

def fix_invalid_folder_locations(out_dir: str=OUTPUT_DIR) -> Tuple[List[str], List[str]]:
    fixed = []
    cant_fix = []
    for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
        if entry.is_dir(follow_symlinks=True):
            if os.path.exists(os.path.join(entry.path, 'index.json')):
                try:
                    link = parse_json_link_details(entry.path)
                except KeyError:
                    link = None
                if not link:
                    continue

                if not entry.path.endswith(f'/{link.timestamp}'):
                    dest = os.path.join(out_dir, ARCHIVE_DIR_NAME, link.timestamp)
                    if os.path.exists(dest):
                        cant_fix.append(entry.path)
                    else:
                        shutil.move(entry.path, dest)
                        fixed.append(dest)
                        timestamp = entry.path.rsplit('/', 1)[-1]
                        assert link.link_dir == entry.path
                        assert link.timestamp == timestamp
                        write_json_link_details(link, out_dir=entry.path)

    return fixed, cant_fix