# Source code for archivebox.index.json

__package__ = 'archivebox.index'

import os
import sys
import json as pyjson

from datetime import datetime
from typing import List, Optional, Iterator, Any

from .schema import Link, ArchiveResult
from ..system import atomic_write
from ..util import enforce_types
from ..config import (
    VERSION,
    OUTPUT_DIR,
    FOOTER_INFO,
    GIT_SHA,
    DEPENDENCIES,
    JSON_INDEX_FILENAME,
    ARCHIVE_DIR_NAME,
)


# Project/build metadata nested under the 'meta' key of the index header.
_MAIN_INDEX_META = {
    'project': 'ArchiveBox',
    'version': VERSION,
    'git_sha': GIT_SHA,
    'website': 'https://ArchiveBox.io',
    'docs': 'https://github.com/pirate/ArchiveBox/wiki',
    'source': 'https://github.com/pirate/ArchiveBox',
    'issues': 'https://github.com/pirate/ArchiveBox/issues',
    'dependencies': DEPENDENCIES,
}

# Static header fields that write_json_main_index() places at the top of the
# main JSON index, ahead of the per-run fields and the links themselves.
MAIN_INDEX_HEADER = {
    'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
    'schema': 'archivebox.index.json',
    'copyright_info': FOOTER_INFO,
    'meta': _MAIN_INDEX_META,
}


### Main Links Index

@enforce_types
def parse_json_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
    """Parse the main JSON index found in out_dir and lazily yield its links.

    Args:
        out_dir: Directory expected to contain the JSON_INDEX_FILENAME file.

    Yields:
        One Link per entry of the index's top-level 'links' list, each built
        with Link.from_json. Nothing is yielded when the index file does not
        exist.

    Raises:
        KeyError: If the index document has no top-level 'links' key.
        ValueError: If the file is not valid JSON (raised by json.load).
    """
    index_path = os.path.join(out_dir, JSON_INDEX_FILENAME)
    if not os.path.exists(index_path):
        return

    # json.load reads the whole document eagerly, so close the file *before*
    # yielding: otherwise the handle stays open for as long as the consumer
    # takes to iterate (or until garbage collection if it never finishes).
    with open(index_path, 'r', encoding='utf-8') as f:
        links = pyjson.load(f)['links']

    for link_json in links:
        yield Link.from_json(link_json)
@enforce_types
def write_json_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
    """Write the main JSON index (header fields + links) into out_dir.

    Args:
        links: Fully materialized list of links to store; generators are rejected.
        out_dir: Directory in which the JSON_INDEX_FILENAME file is written.
    """
    assert isinstance(links, List), 'Links must be a list, not a generator.'

    # Spot-check the shape of the first link only, as a cheap guard against
    # being handed half-parsed data.
    if links:
        first_link = links[0]
        assert isinstance(first_link.history, dict)
        assert isinstance(first_link.sources, list)

        title_results = first_link.history.get('title')
        if title_results:
            assert isinstance(title_results[0], ArchiveResult)

        if first_link.sources:
            assert isinstance(first_link.sources[0], str)

    # Start from a copy of the static header, then add the per-run fields.
    main_index_json = dict(MAIN_INDEX_HEADER)
    main_index_json.update({
        'num_links': len(links),
        'updated': datetime.now(),
        'last_run_cmd': sys.argv,
        'links': links,
    })

    index_path = os.path.join(out_dir, JSON_INDEX_FILENAME)
    atomic_write(main_index_json, index_path)
### Link Details Index

### Helpers
class ExtendedEncoder(pyjson.JSONEncoder):
    """
    JSON encoder that also handles objects exposing _asdict(), bytes,
    datetimes, exceptions, and dict views (items/keys/values).
    """

    def default(self, obj):
        """Return a JSON-serializable stand-in for obj.

        Anything not recognized here is passed to JSONEncoder.default,
        which raises TypeError.
        """
        # Checked first, so any object with an _asdict attribute takes this path.
        if hasattr(obj, '_asdict'):
            return obj._asdict()

        if isinstance(obj, bytes):
            return obj.decode()

        if isinstance(obj, datetime):
            return obj.isoformat()

        type_name = obj.__class__.__name__

        if isinstance(obj, Exception):
            return '{}: {}'.format(type_name, obj)

        # The dict view types are matched by class name.
        if type_name in ('dict_items', 'dict_keys', 'dict_values'):
            return tuple(obj)

        return super().default(obj)
@enforce_types
def to_json(obj: Any, indent: Optional[int]=4, sort_keys: bool=True, cls=ExtendedEncoder) -> str:
    """Serialize obj to a JSON string.

    Args:
        obj: The value to serialize.
        indent: Indentation width passed to json.dumps; None gives compact output.
        sort_keys: Whether dictionary keys are emitted in sorted order.
        cls: JSONEncoder subclass used for serialization; defaults to
            ExtendedEncoder.

    Returns:
        The JSON text produced by json.dumps.
    """
    # Forward the caller-supplied encoder instead of hard-coding
    # ExtendedEncoder, so the cls argument actually takes effect.
    return pyjson.dumps(obj, indent=indent, sort_keys=sort_keys, cls=cls)