Source code for archivebox.index.schema

"""

WARNING: THIS FILE IS ALL LEGACY CODE TO BE REMOVED.

DO NOT ADD ANY NEW FEATURES TO THIS FILE. NEW CODE GOES HERE: core/models.py

"""

__package__ = 'archivebox.index'

from pathlib import Path

from datetime import datetime, timezone, timedelta

from typing import List, Dict, Any, Optional, Union

from dataclasses import dataclass, asdict, field, fields

from django.utils.functional import cached_property

from ..system import get_dir_size
from ..util import ts_to_date_str, parse_date
from ..config import OUTPUT_DIR, ARCHIVE_DIR_NAME, FAVICON_PROVIDER

class ArchiveError(Exception):
    """Exception that carries an optional ``hints`` value next to its message.

    Args:
        message: Passed straight to ``Exception.__init__``.
        hints: Stored unchanged on the instance as ``self.hints``;
            defaults to ``None``.
    """

    def __init__(self, message, hints=None):
        super().__init__(message)
        # Kept as given so whoever catches the error can inspect it.
        self.hints = hints
LinkDict = Dict[str, Any]

ArchiveOutput = Union[str, Exception, None]


@dataclass(frozen=True)
class ArchiveResult:
    """Frozen record of one command run: what ran, where, and how it ended.

    Every instance is validated by :meth:`typecheck` right after creation.
    """

    cmd: List[str]                 # each item must be a non-empty str
    pwd: Optional[str]
    cmd_version: Optional[str]
    output: ArchiveOutput          # str, Exception, or None
    status: str                    # must be a non-empty str
    start_ts: datetime
    end_ts: datetime
    index_texts: Union[List[str], None] = None
    schema: str = 'ArchiveResult'  # must equal the class name

    def __post_init__(self):
        self.typecheck()

    def _asdict(self):
        """Return all fields as a plain dict (same as ``to_dict()`` with no keys)."""
        return asdict(self)

    def typecheck(self) -> None:
        """Assert that every field holds a value of the expected type.

        Raises:
            AssertionError: if any check fails. These are plain ``assert``
                statements, so they are skipped when Python runs with ``-O``.
        """
        assert self.schema == self.__class__.__name__
        assert isinstance(self.status, str) and self.status
        assert isinstance(self.start_ts, datetime)
        assert isinstance(self.end_ts, datetime)
        assert isinstance(self.cmd, list)
        assert all(isinstance(arg, str) and arg for arg in self.cmd)

        # TODO: replace emptystrings in these three with None / remove them from the DB
        assert self.pwd is None or isinstance(self.pwd, str)
        assert self.cmd_version is None or isinstance(self.cmd_version, str)
        assert self.output is None or isinstance(self.output, (str, Exception))

    @classmethod
    def guess_ts(cls, dict_info):
        """Derive ``(start_ts, end_ts)`` from a dict with ``timestamp`` and ``duration``.

        ``timestamp`` is parsed with ``parse_date`` and used as the start;
        the end is the start plus ``int(duration)`` seconds.

        Raises:
            KeyError: if ``timestamp`` or ``duration`` is missing.
        """
        start_ts = parse_date(dict_info["timestamp"])
        end_ts = start_ts + timedelta(seconds=int(dict_info["duration"]))
        return start_ts, end_ts

    @classmethod
    def from_json(cls, json_info, guess=False):
        """Build an instance from a decoded JSON dict.

        Keys that are not dataclass fields are ignored.

        Args:
            json_info: Mapping of field names to raw values.
            guess: When true, fill in missing fields instead of failing:
                ``start_ts``/``end_ts`` come from :meth:`guess_ts`, ``pwd``
                from ``OUTPUT_DIR / ARCHIVE_DIR_NAME / json_info["timestamp"]``,
                ``cmd_version`` becomes ``"Undefined"`` and ``cmd`` becomes ``[]``.

        Raises:
            KeyError: if a required key is absent and is not filled by ``guess``.
            AssertionError: if the resulting values fail :meth:`typecheck`.
        """
        # Build the lookup once instead of calling field_names() for every key.
        known_fields = set(cls.field_names())
        info = {key: val for key, val in json_info.items() if key in known_fields}

        if guess:
            if "start_ts" not in info:
                info["start_ts"], info["end_ts"] = cls.guess_ts(json_info)
            else:
                info['start_ts'] = parse_date(info['start_ts'])
                info['end_ts'] = parse_date(info['end_ts'])
            if "pwd" not in info:
                info["pwd"] = str(Path(OUTPUT_DIR) / ARCHIVE_DIR_NAME / json_info["timestamp"])
            if "cmd_version" not in info:
                info["cmd_version"] = "Undefined"
            if "cmd" not in info:
                info["cmd"] = []
        else:
            info['start_ts'] = parse_date(info['start_ts'])
            info['end_ts'] = parse_date(info['end_ts'])
            info['cmd_version'] = info.get('cmd_version')

        # A bare string command is wrapped so it satisfies the List[str] field.
        if isinstance(info["cmd"], str):
            info["cmd"] = [info["cmd"]]
        return cls(**info)

    def to_dict(self, *keys) -> dict:
        """Return the fields as a dict, limited to ``keys`` when any are given."""
        as_dict = asdict(self)
        if not keys:
            return as_dict
        return {k: v for k, v in as_dict.items() if k in keys}

    def to_json(self, indent=4, sort_keys=True) -> str:
        """Serialize this result with the sibling ``.json`` module's ``to_json``."""
        from .json import to_json

        return to_json(self, indent=indent, sort_keys=sort_keys)

    def to_csv(self, cols: Optional[List[str]]=None, separator: str=',', ljust: int=0) -> str:
        """Serialize this result with the sibling ``.csv`` module's ``to_csv``.

        Args:
            cols: Columns to emit; all field names are used when empty or None.
            separator: Passed through to ``to_csv``.
            ljust: Passed through to ``to_csv``.
        """
        from .csv import to_csv

        return to_csv(self, csv_col=cols or self.field_names(), separator=separator, ljust=ljust)

    @classmethod
    def field_names(cls):
        """Return the dataclass field names in declaration order."""
        return [f.name for f in fields(cls)]

    @property
    def duration(self) -> int:
        """Whole seconds elapsed between ``start_ts`` and ``end_ts``.

        ``total_seconds()`` is used instead of ``timedelta.seconds`` because
        the latter drops the ``days`` component, under-reporting any run of a
        day or more and turning a negative delta into a large positive number.
        """
        return int((self.end_ts - self.start_ts).total_seconds())