Source code for chat_archive.html

# Easy to use offline chat archive.
#
# Author: Peter Odding <peter@peterodding.com>
# Last Change: August 1, 2018
# URL: https://github.com/xolox/python-chat-archive

"""Utility functions for working with the HTML encoded text."""

# Standard library modules.
import html
import html.entities
import html.parser
import io
import re

# External dependencies.
from humanfriendly.text import compact_empty_lines
from verboselogs import VerboseLogger

# Public identifiers that require documentation.
__all__ = (
    "BLOCK_TAGS",
    "HTMLStripper",
    "URL_PATTERN",
    "html_to_text",
    "text_to_html",
)

# Consulted by HTMLStripper.handle_starttag() and HTMLStripper.handle_endtag(),
# both of which write "\n\n" to the output for every tag name listed here.
BLOCK_TAGS = ["div", "p", "pre"]
"""
A list of strings with HTML tags that are considered block-level elements. The
:class:`HTMLStripper` emits an empty line before and after each block-level
element that it encounters.
"""

# The pattern is a raw string so that ``\(`` and ``\)`` reach the regular
# expression engine untouched instead of being (invalid) string escapes.
#
# NOTE: the character class ``[$-_@.&+]`` contains the *range* ``$-_`` (ASCII
# 0x24 through 0x5F). That range is what allows ``/``, ``:``, ``?`` and ``=``
# to be part of a match, so don't "simplify" it into literal characters
# without adding those characters back.
URL_PATTERN = re.compile(
    r"(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)"
)
"""
A compiled regular expression pattern to find URLs in text
(credit: taken from `urlregex.com <http://urlregex.com/>`_).
"""

# Initialize a logger for this module (it is named after the module via ``__name__``).
logger = VerboseLogger(__name__)


def html_to_text(html_text):
    """
    Strip the markup from a fragment of HTML and return the remaining text.

    :param html_text: The HTML fragment to convert (a string).
    :returns: The text collected by the parser (a string).

    The work is delegated to a fresh :class:`HTMLStripper` instance (a
    subclass of :class:`html.parser.HTMLParser`): the fragment is fed to it,
    the parser is closed and whatever it wrote to its ``output`` buffer is
    returned as is (no post-processing of empty lines is performed here).
    """
    stripper = HTMLStripper()
    stripper.feed(html_text)
    stripper.close()
    collected = stripper.output
    return collected.getvalue()


def text_to_html(text, callback=None):
    """
    Encode plain text as HTML, turning URLs into hyperlinks.

    :param text: The plain text to convert (a string).
    :param callback: An optional function that is called with each piece of
                     text that is *not* a URL and whose return value is
                     HTML escaped in place of that piece.
    :returns: The HTML encoded text (a string).

    The text is split on :data:`URL_PATTERN`. Each URL becomes an
    ``<a href="...">`` tag and everything else only has its special
    characters escaped, that's all this function does.
    """
    fragments = []
    for piece in URL_PATTERN.split(text):
        if not URL_PATTERN.match(piece):
            # Regular text: give the caller a chance to transform it first.
            plain = callback(piece) if callback else piece
            fragments.append(html.escape(plain, quote=False))
            continue
        # A URL: quotes are escaped inside the attribute value but left
        # alone in the visible link label.
        target = html.escape(piece, quote=True)
        label = html.escape(piece, quote=False)
        fragments.append('<a href="%s">%s</a>' % (target, label))
    return "".join(fragments)


class HTMLStripper(html.parser.HTMLParser):

    """
    A simple HTML to text converter based on :class:`html.parser.HTMLParser`.

    The converted text is accumulated in the :attr:`output` attribute (an
    :class:`io.StringIO` object) which is replaced by an empty buffer each
    time :func:`reset()` is called.
    """

    def __call__(self, data):
        """
        Convert HTML to text.

        :param data: The HTML to convert to text (a string).
        :returns: The converted text (a string).

        This method calls :func:`~humanfriendly.text.compact_empty_lines()`
        on the converted text to normalize superfluous empty lines caused
        by vertical whitespace emitted around block level elements like
        ``<div>``, ``<p>`` and ``<pre>``.
        """
        # Start from a clean slate so the instance can be reused.
        self.reset()
        self.feed(data)
        self.close()
        return compact_empty_lines(self.output.getvalue())

    def handle_charref(self, value):
        """
        Process a decimal or hexadecimal numeric character reference.

        :param value: The decimal or hexadecimal value (a string), without
                      the leading ``&#`` and trailing ``;``.

        Hexadecimal references are recognized by an ``x`` or ``X`` prefix.
        When the value isn't a valid number or code point the reference is
        written to the output literally instead of raising an exception.

        :class:`html.parser.HTMLParser` only calls this method when the
        parser was created with ``convert_charrefs=False``.
        """
        try:
            if value.startswith(("x", "X")):
                character = chr(int(value[1:], 16))
            else:
                character = chr(int(value))
        except (OverflowError, ValueError):
            # Not a number or outside the Unicode range: keep the input text.
            character = "&#%s;" % value
        self.output.write(character)

    def handle_data(self, data):
        """Capture decoded text data."""
        self.output.write(data)

    def handle_endtag(self, tag):
        """Emit empty lines around block level elements."""
        if tag in BLOCK_TAGS:
            self.output.write("\n\n")

    def handle_entityref(self, name):
        """
        Process a named character reference.

        :param name: The name of the character reference (a string).

        Names defined in :data:`html.entities.name2codepoint` are translated
        using that table. Other names are handed to :func:`html.unescape()`,
        which knows the HTML 5 names (like ``apos``) and leaves unknown
        references untouched, so they end up in the output literally
        instead of raising :exc:`KeyError`.

        :class:`html.parser.HTMLParser` only calls this method when the
        parser was created with ``convert_charrefs=False``.
        """
        codepoint = html.entities.name2codepoint.get(name)
        if codepoint is not None:
            self.output.write(chr(codepoint))
        else:
            self.output.write(html.unescape("&%s;" % name))

    def handle_starttag(self, tag, attrs):
        """Translate ``<br>`` tags to line breaks and emit empty lines around block level elements."""
        if tag == "br":
            self.output.write("\n")
        elif tag in BLOCK_TAGS:
            self.output.write("\n\n")

    def reset(self):
        """Reset the state of the :class:`HTMLStripper` instance."""
        # Reset the state of the superclass.
        super().reset()
        # Reset our instance variables (start with an empty output buffer).
        self.output = io.StringIO()