abx_plugin_htmltotext.htmltotext

Module Contents

Classes

HTMLTextExtractor

Functions

get_output_path

should_save_htmltotext

save_htmltotext

extract search-indexing-friendly text from an HTML document

API

abx_plugin_htmltotext.htmltotext.get_output_path()[source]
class abx_plugin_htmltotext.htmltotext.HTMLTextExtractor[source]

Bases: html.parser.HTMLParser

TEXT_ATTRS[source]

['alt', 'cite', 'href', 'label', 'list', 'placeholder', 'title', 'value']

NOTEXT_TAGS[source]

['script', 'style', 'template']

NOTEXT_HREF[source]

['data:', 'javascript:', '#']

_is_text_attr(name, value)[source]
_parent_tag()[source]
_in_notext_tag()[source]
handle_starttag(tag, attrs)[source]
handle_endtag(tag)[source]
handle_data(data)[source]
__str__()[source]
abx_plugin_htmltotext.htmltotext.should_save_htmltotext(link: archivebox.index.schema.Link, out_dir: Optional[pathlib.Path] = None, overwrite: Optional[bool] = False) -> bool[source]
abx_plugin_htmltotext.htmltotext.save_htmltotext(link: archivebox.index.schema.Link, out_dir: Optional[pathlib.Path] = None, timeout: int = ARCHIVING_CONFIG.TIMEOUT) -> archivebox.index.schema.ArchiveResult[source]

extract search-indexing-friendly text from an HTML document