# Source code for archivebox.index.json

__package__ = 'archivebox.index'

import os
import sys
import json as pyjson
from pathlib import Path

from datetime import datetime, timezone
from typing import List, Optional, Iterator, Any, Union

from .schema import Link
from ..system import atomic_write
from ..util import enforce_types
from ..config import (
    VERSION,
    OUTPUT_DIR,
    FOOTER_INFO,
    DEPENDENCIES,
    JSON_INDEX_FILENAME,
    ARCHIVE_DIR_NAME,
    ANSI
)


# Static metadata describing the main JSON index: a human-readable blurb, the
# schema identifier, and project/version details taken from the config module.
# NOTE(review): presumably merged into the top level of the written index.json;
# the writer is not visible in this part of the file -- confirm.
MAIN_INDEX_HEADER = {
    'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
    'schema': 'archivebox.index.json',
    'copyright_info': FOOTER_INFO,
    # Project-level metadata; 'version' and 'git_sha' both carry VERSION.
    'meta': {
        'project': 'ArchiveBox',
        'version': VERSION,
        'git_sha': VERSION,  # not used anymore, but kept for backwards compatibility
        'website': 'https://ArchiveBox.io',
        'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki',
        'source': 'https://github.com/ArchiveBox/ArchiveBox',
        'issues': 'https://github.com/ArchiveBox/ArchiveBox/issues',
        'dependencies': DEPENDENCIES,
    },
}





@enforce_types
def parse_json_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]:
    """Parse the main JSON index under ``out_dir`` and yield its links.

    The index file is ``out_dir / JSON_INDEX_FILENAME``. If it does not exist,
    or its ``links`` list cannot be loaded (or its first entry cannot be
    parsed), nothing is yielded; in the latter case a warning is printed.

    Each entry is loaded with progressively more lenient strategies, moving to
    the next one whenever a ``KeyError`` is raised:

    1. ``Link.from_json(link_json)``
    2. the per-link detail index at ``out_dir / ARCHIVE_DIR_NAME / <timestamp>``
    3. ``Link.from_json(link_json, guess=True)``

    Entries that fail all three are reported on stdout and skipped.
    """
    index_path = Path(out_dir) / JSON_INDEX_FILENAME
    if not index_path.exists():
        return

    # Read everything up front so the file handle is not held open for as long
    # as the caller keeps this generator alive.
    with open(index_path, 'r', encoding='utf-8') as f:
        try:
            links = pyjson.load(f)['links']
            if links:
                # Sanity-check the first entry so a broken index is reported
                # once here instead of entry by entry below.
                Link.from_json(links[0])
        except Exception as err:
            print(" {lightyellow}! Found an index.json in the project root but couldn't load links from it: {} {}".format(
                err.__class__.__name__,
                err,
                **ANSI,
            ))
            return

    for link_json in links:
        try:
            yield Link.from_json(link_json)
        except KeyError:
            # Reset per entry: if 'timestamp' itself is the missing key, the
            # path below is never assigned and must not leak from an earlier
            # iteration (or be unbound) when the failure is reported.
            detail_index_path = None
            try:
                # Look next to the index that was actually read (out_dir),
                # not the global default output directory.
                detail_index_path = Path(out_dir) / ARCHIVE_DIR_NAME / link_json['timestamp']
                yield parse_json_link_details(str(detail_index_path))
            except KeyError:
                # as a last effort, try to guess the missing values out of existing ones
                try:
                    yield Link.from_json(link_json, guess=True)
                except KeyError:
                    print(" {lightyellow}! Failed to load the index.json from {}".format(
                        detail_index_path if detail_index_path is not None else '<unknown: link has no timestamp>',
                        **ANSI,
                    ))
                    continue
### Link Details Index

### Helpers
class ExtendedEncoder(pyjson.JSONEncoder):
    """
    JSON encoder with fallbacks for a few types the stock encoder rejects:
    objects exposing ``_asdict()``, bytes, datetimes, exceptions and dict views.
    """

    def default(self, obj):
        """Return a JSON-serializable stand-in for ``obj``.

        Anything not covered by the checks below is passed to the base
        ``JSONEncoder.default``, which raises ``TypeError``.
        """
        # The checks run in this order; the first match wins.
        if hasattr(obj, '_asdict'):
            return obj._asdict()

        if isinstance(obj, bytes):
            return obj.decode()

        if isinstance(obj, datetime):
            return obj.isoformat()

        if isinstance(obj, Exception):
            return f'{obj.__class__.__name__}: {obj}'

        # dict views are matched by their class name
        type_name = obj.__class__.__name__
        if type_name in ('dict_items', 'dict_keys', 'dict_values'):
            return tuple(obj)

        return super().default(obj)
@enforce_types
def to_json(obj: Any, indent: Optional[int]=4, sort_keys: bool=True, cls=ExtendedEncoder) -> str:
    """Serialize ``obj`` to a JSON string.

    ``indent`` and ``sort_keys`` are forwarded to ``json.dumps``. ``cls`` is
    the encoder class to use and defaults to ``ExtendedEncoder``.
    """
    # Forward the caller's encoder: previously ExtendedEncoder was hard-coded
    # here, so a custom ``cls`` argument was silently ignored.
    return pyjson.dumps(obj, indent=indent, sort_keys=sort_keys, cls=cls)