# Source code for chat_archive.html

# Easy to use offline chat archive.
#
# Author: Peter Odding <peter@peterodding.com>
# Last Change: August 1, 2018
# URL: https://github.com/xolox/python-chat-archive

"""Utility functions for working with the HTML encoded text."""

# Standard library modules.
import html
import html.entities
import html.parser
import io
import re

# External dependencies.
from humanfriendly.text import compact_empty_lines
from verboselogs import VerboseLogger

# Public identifiers that require documentation.
__all__ = (
    "BLOCK_TAGS",
    "HTMLStripper",
    "URL_PATTERN",
    "html_to_text",
    "text_to_html",
)

# Consulted by HTMLStripper.handle_starttag() and HTMLStripper.handle_endtag()
# through ``tag in BLOCK_TAGS`` membership tests; both write "\n\n" to the
# output buffer when the tag is listed here.
BLOCK_TAGS = ["div", "p", "pre"]
"""
A list of strings with HTML tags that are considered block-level elements. The
:class:`HTMLStripper` emits an empty line before and after each block-level
element that it encounters.
"""

# The pattern is written as a raw string so that ``\(`` and ``\)`` reach the
# regular expression engine unchanged instead of being (invalid) string escape
# sequences, which newer Python versions warn about.
#
# NOTE: inside the third character class ``$-_`` is a *range* (from ``$`` up to
# ``_``), so it also covers characters such as ``/``, ``:``, ``?`` and ``=``.
# This is kept exactly as in the upstream pattern to preserve what is matched.
URL_PATTERN = re.compile(r"(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)")
"""
A compiled regular expression pattern to find URLs in text
(credit: taken from `urlregex.com <http://urlregex.com/>`_).

The whole URL is wrapped in a single capture group, which means that
:func:`re.Pattern.split()` returns the matched URLs in between the
surrounding fragments of text.
"""

# Initialize a logger for this module.
logger = VerboseLogger(__name__)


def html_to_text(html_text):
    """
    Strip the markup from a fragment of HTML and return the remaining text.

    :param html_text: The HTML fragment to convert (a string).
    :returns: The text collected by the parser (a string).

    The conversion is delegated to a fresh :class:`HTMLStripper` instance (a
    subclass of :class:`html.parser.HTMLParser`): the fragment is fed to the
    parser, the parser is closed and the contents of its output buffer are
    returned as is.
    """
    stripper = HTMLStripper()
    stripper.feed(html_text)
    # Closing the parser makes it process any input it is still buffering.
    stripper.close()
    plain_text = stripper.output.getvalue()
    return plain_text
def text_to_html(text, callback=None):
    """
    Encode a fragment of plain text as HTML.

    :param text: The plain text to encode (a string).
    :param callback: Optional callable that receives each fragment of text
                     that is *not* a URL and returns the (pre-processed) text
                     that should be HTML encoded in its place.
    :returns: The HTML encoded text (a string).

    Only two transformations are applied: URLs matched by
    :data:`URL_PATTERN` are wrapped in ``<a href="...">`` tags and special
    characters are escaped using :func:`html.escape()`.
    """
    fragments = []
    # URL_PATTERN contains a capture group, so split() yields the matched
    # URLs interleaved with the text that surrounds them.
    for piece in URL_PATTERN.split(text):
        if not URL_PATTERN.match(piece):
            # Regular text: give the caller a chance to rewrite it, then
            # escape it (quotes are left alone outside of attributes).
            if callback:
                piece = callback(piece)
            fragments.append(html.escape(piece, quote=False))
            continue
        # A URL: quotes must be escaped in the attribute value but not in
        # the visible link label.
        target = html.escape(piece, quote=True)
        label = html.escape(piece, quote=False)
        fragments.append('<a href="%s">%s</a>' % (target, label))
    return "".join(fragments)
class HTMLStripper(html.parser.HTMLParser):

    """
    A simple HTML to text converter based on :class:`html.parser.HTMLParser`.

    Text is accumulated in :attr:`output` (an :class:`io.StringIO` object that
    is replaced on every :func:`reset()`). ``<br>`` tags become line breaks
    and the tags listed in :data:`BLOCK_TAGS` are surrounded by empty lines.

    The :func:`handle_charref()` and :func:`handle_entityref()` methods are
    only invoked by :class:`html.parser.HTMLParser` when it was created with
    ``convert_charrefs=False``; otherwise the parser decodes character
    references itself before calling :func:`handle_data()`.
    """

    def __call__(self, data):
        """
        Convert HTML to text.

        :param data: The HTML to convert to text (a string).
        :returns: The converted text (a string).

        This method calls :func:`~humanfriendly.text.compact_empty_lines()` on
        the converted text to normalize superfluous empty lines caused by
        vertical whitespace emitted around block level elements like
        ``<div>``, ``<p>`` and ``<pre>``.
        """
        # Start from a clean slate so that the instance can be reused.
        self.reset()
        self.feed(data)
        self.close()
        text = self.output.getvalue()
        return compact_empty_lines(text)

    def handle_charref(self, value):
        """
        Process a decimal or hexadecimal numeric character reference.

        :param value: The decimal or hexadecimal value (a string), without
                      the leading ``&#`` and trailing ``;``. Hexadecimal
                      values start with ``x`` or ``X``.

        References that cannot be converted to a character (malformed digits
        or a code point outside the Unicode range) are written to the output
        as literal text instead of raising an exception.
        """
        try:
            # The HTML syntax accepts both ``&#x41;`` and ``&#X41;``.
            if value[:1] in ("x", "X"):
                codepoint = int(value[1:], 16)
            else:
                codepoint = int(value)
            character = chr(codepoint)
        except (ValueError, OverflowError):
            # Preserve the reference as text rather than aborting the parse.
            character = "&#%s;" % value
        self.output.write(character)

    def handle_data(self, data):
        """
        Capture decoded text data.

        :param data: The text to append to the output (a string).
        """
        self.output.write(data)

    def handle_endtag(self, tag):
        """
        Emit empty lines around block level elements.

        :param tag: The name of the closing tag (a string).
        """
        if tag in BLOCK_TAGS:
            self.output.write("\n\n")

    def handle_entityref(self, name):
        """
        Process a named character reference.

        :param name: The name of the character reference (a string), without
                     the leading ``&`` and trailing ``;``.

        The name is looked up in :data:`html.entities.name2codepoint` first
        and in :data:`html.entities.html5` second. Unknown names are written
        to the output as the literal text ``&name;`` instead of raising
        :exc:`KeyError` (the parser doesn't tell us whether the original
        reference was terminated by a semicolon, so one is always emitted).
        """
        codepoint = html.entities.name2codepoint.get(name)
        if codepoint is not None:
            replacement = chr(codepoint)
        else:
            replacement = html.entities.html5.get(name + ";", "&%s;" % name)
        self.output.write(replacement)

    def handle_starttag(self, tag, attrs):
        """
        Translate ``<br>`` tags to line breaks and block level tags to empty lines.

        :param tag: The name of the opening tag (a string).
        :param attrs: The attributes of the tag (ignored).
        """
        if tag == "br":
            self.output.write("\n")
        elif tag in BLOCK_TAGS:
            self.output.write("\n\n")

    def reset(self):
        """Reset the state of the :class:`HTMLStripper` instance."""
        # Reset the state of the superclass.
        super().reset()
        # Reset our instance variables: start with an empty output buffer.
        self.output = io.StringIO()