archivebox.core.models

Module Contents

Classes

Tag

Old tag model, loosely based on django-taggit model + ABID base.

SnapshotTag

SnapshotManager

Snapshot

ArchiveResultManager

ArchiveResult

Functions

validate_timestamp

API

class archivebox.core.models.Tag(*args: Any, **kwargs: Any)[source]

Bases: archivebox.base_models.models.ModelWithReadOnlyFields, archivebox.base_models.models.ModelWithSerializers, archivebox.base_models.models.ModelWithUUID, archivebox.base_models.models.ABIDModel

Old tag model, loosely based on django-taggit model + ABID base.

Being phased out in favor of archivebox.tags.models.ATag

Initialization

Overridden init method ensures we have a stable creation timestamp that fields can use within initialization code before saving to the DB.

abid_prefix[source]

‘tag_’

abid_ts_src[source]

‘self.created_at’

abid_uri_src[source]

‘self.slug’

abid_subtype_src[source]

‘“03”’

abid_rand_src[source]

self.id

abid_drift_allowed[source]

True

read_only_fields[source]

(‘id’, ‘abid’, ‘created_at’, ‘created_by’, ‘slug’)

id[source]

‘UUIDField(…)’

abid[source]

‘ABIDField(…)’

created_by[source]

‘ForeignKey(…)’

created_at[source]

‘AutoDateTimeField(…)’

modified_at[source]

‘DateTimeField(…)’

name[source]

‘CharField(…)’

slug[source]

‘SlugField(…)’

snapshot_set: django.db.models.Manager[Snapshot][source]

None

class Meta[source]

Bases: django_stubs_ext.db.models.TypedModelMeta

verbose_name[source]

‘Tag’

verbose_name_plural[source]

‘Tags’

__str__()[source]
slugify(tag, i=None)[source]
clean(*args, **kwargs)[source]
save(*args, **kwargs)[source]
property api_url: str[source]
property api_docs_url: str[source]
class archivebox.core.models.SnapshotTag[source]

Bases: django.db.models.Model

id[source]

‘AutoField(…)’

snapshot[source]

‘ForeignKey(…)’

tag[source]

‘ForeignKey(…)’

class Meta[source]
db_table[source]

‘core_snapshot_tags’

unique_together[source]

[(‘snapshot’, ‘tag’)]

archivebox.core.models.validate_timestamp(value)[source]
class archivebox.core.models.SnapshotManager[source]

Bases: django.db.models.Manager

filter(*args, **kwargs)[source]

Add support for .filter(domain='example.com') to the Snapshot queryset.

get_queryset()[source]
class archivebox.core.models.Snapshot(*args: Any, **kwargs: Any)[source]

Bases: archivebox.base_models.models.ModelWithReadOnlyFields, archivebox.base_models.models.ModelWithSerializers, archivebox.base_models.models.ModelWithUUID, archivebox.base_models.models.ModelWithKVTags, archivebox.base_models.models.ABIDModel, archivebox.base_models.models.ModelWithOutputDir, archivebox.base_models.models.ModelWithConfig, archivebox.base_models.models.ModelWithNotes, archivebox.base_models.models.ModelWithHealthStats, workers.models.ModelWithStateMachine

read_only_fields[source]

(‘id’, ‘abid’, ‘created_at’, ‘created_by_id’, ‘url’, ‘timestamp’, ‘bookmarked_at’, ‘crawl_id’)

id[source]

‘UUIDField(…)’

abid[source]

‘ABIDField(…)’

created_by[source]

‘ForeignKey(…)’

created_at[source]

‘AutoDateTimeField(…)’

url[source]

‘URLField(…)’

timestamp[source]

‘CharField(…)’

bookmarked_at[source]

‘AutoDateTimeField(…)’

crawl: crawls.models.Crawl[source]

‘ForeignKey(…)’

title[source]

‘CharField(…)’

downloaded_at[source]

‘DateTimeField(…)’

modified_at[source]

‘DateTimeField(…)’

retry_at[source]

‘RetryAtField(…)’

status[source]

‘StatusField(…)’

config[source]

‘JSONField(…)’

notes[source]

‘TextField(…)’

output_dir[source]

‘FilePathField(…)’

tags[source]

‘ManyToManyField(…)’

abid_prefix[source]

‘snp_’

abid_ts_src[source]

‘self.created_at’

abid_uri_src[source]

‘self.url’

abid_subtype_src[source]

‘“01”’

abid_rand_src[source]

self.id

abid_drift_allowed[source]

True

state_machine_name[source]

‘core.statemachines.SnapshotMachine’

state_field_name[source]

‘status’

retry_at_field_name[source]

‘retry_at’

StatusChoices[source]

None

active_state[source]

None

objects[source]

‘SnapshotManager(…)’

archiveresult_set: django.db.models.Manager[ArchiveResult][source]

None

save(*args, **kwargs)[source]
output_dir_parent() str[source]
output_dir_name() str[source]
archive(overwrite=False, methods=None)[source]
__repr__() str[source]
__str__() str[source]
classmethod from_json(fields: dict[str, Any]) Self[source]
as_json(*args, **kwargs) dict[source]
tags_str(nocache=True) str | None[source]
icons() str[source]
property api_url: str[source]
property api_docs_url: str[source]
get_absolute_url()[source]
title_stripped() str[source]
extension() str[source]
bookmarked()[source]
bookmarked_date()[source]
domain() str[source]
is_archived()[source]
num_outputs() int[source]
base_url()[source]
archive_path()[source]
archive_size()[source]
thumbnail_url() Optional[str][source]
headers() Optional[Dict[str, str]][source]
status_code() Optional[str][source]
history() dict[source]
latest_title() Optional[str][source]
save_tags(tags: Iterable[str] = ()) None[source]
pending_archiveresults() django.db.models.QuerySet[archivebox.core.models.ArchiveResult][source]
create_pending_archiveresults() list[archivebox.core.models.ArchiveResult][source]
class archivebox.core.models.ArchiveResultManager[source]

Bases: django.db.models.Manager

indexable(sorted: bool = True)[source]

Return only ArchiveResults containing text suitable for full-text search (sorted in order of typical result quality)

class archivebox.core.models.ArchiveResult(*args: Any, **kwargs: Any)[source]

Bases: archivebox.base_models.models.ModelWithReadOnlyFields, archivebox.base_models.models.ModelWithSerializers, archivebox.base_models.models.ModelWithUUID, archivebox.base_models.models.ModelWithKVTags, archivebox.base_models.models.ABIDModel, archivebox.base_models.models.ModelWithOutputDir, archivebox.base_models.models.ModelWithConfig, archivebox.base_models.models.ModelWithNotes, archivebox.base_models.models.ModelWithHealthStats, workers.models.ModelWithStateMachine

abid_prefix[source]

‘res_’

abid_ts_src[source]

‘self.snapshot.created_at’

abid_uri_src[source]

‘self.snapshot.url’

abid_subtype_src[source]

‘self.extractor’

abid_rand_src[source]

self.id

abid_drift_allowed[source]

True

class StatusChoices[source]

Bases: django.db.models.TextChoices

QUEUED[source]

(‘queued’, ‘Queued’)

STARTED[source]

(‘started’, ‘Started’)

BACKOFF[source]

(‘backoff’, ‘Waiting to retry’)

SUCCEEDED[source]

(‘succeeded’, ‘Succeeded’)

FAILED[source]

(‘failed’, ‘Failed’)

SKIPPED[source]

(‘skipped’, ‘Skipped’)

state_machine_name[source]

‘core.statemachines.ArchiveResultMachine’

retry_at_field_name[source]

‘retry_at’

state_field_name[source]

‘status’

active_state[source]

None

EXTRACTOR_CHOICES[source]

((‘htmltotext’, ‘htmltotext’), (‘git’, ‘git’), (‘singlefile’, ‘singlefile’), (‘media’, ‘media’), (‘a…

read_only_fields[source]

(‘id’, ‘abid’, ‘created_at’, ‘created_by’, ‘snapshot’, ‘extractor’, ‘pwd’)

id[source]

‘UUIDField(…)’

abid[source]

‘ABIDField(…)’

created_by[source]

‘ForeignKey(…)’

created_at[source]

‘AutoDateTimeField(…)’

snapshot: archivebox.core.models.Snapshot[source]

‘ForeignKey(…)’

extractor[source]

‘CharField(…)’

pwd[source]

‘CharField(…)’

cmd[source]

‘JSONField(…)’

modified_at[source]

‘DateTimeField(…)’

cmd_version[source]

‘CharField(…)’

output[source]

‘CharField(…)’

start_ts[source]

‘DateTimeField(…)’

end_ts[source]

‘DateTimeField(…)’

status[source]

‘StatusField(…)’

retry_at[source]

‘RetryAtField(…)’

notes[source]

‘TextField(…)’

output_dir[source]

‘CharField(…)’

iface[source]

‘ForeignKey(…)’

objects[source]

‘ArchiveResultManager(…)’

keys[source]

(‘snapshot_id’, ‘extractor’, ‘cmd’, ‘pwd’, ‘cmd_version’, ‘output’, ‘start_ts’, ‘end_ts’, ‘created_a…

class Meta[source]

Bases: django_stubs_ext.db.models.TypedModelMeta

verbose_name[source]

‘Archive Result’

verbose_name_plural[source]

‘Archive Results Log’

__repr__()[source]
__str__()[source]
save(*args, write_indexes: bool = False, **kwargs)[source]
snapshot_dir()[source]
url()[source]
property api_url: str[source]
property api_docs_url: str[source]
get_absolute_url()[source]
property extractor_module: Any | None[source]
property EXTRACTOR: object[source]
embed_path() str | None[source]

Return the actual runtime-calculated path to the on-disk file that should be used for user-facing iframe embeds of this result.

legacy_output_path()[source]
output_exists() bool[source]
create_output_dir()[source]
canonical_outputs() Dict[str, Optional[str]][source]

Predict the expected output paths that should be present after archiving

property output_dir_name: str[source]
property output_dir_parent: str[source]
output_files() dict[str, dict][source]
announce_event(output_type: str, event: dict)[source]
events(filter_type: str | None = None) list[dict][source]
write_indexes()[source]

Write the ArchiveResult json, html, and merkle indexes to output dir, and pass searchable text to the search backend

save_search_index()[source]

Pass any indexable text to the search backend indexer (e.g. sonic, SQLiteFTS5, etc.)