From 00e12ccec1b44856669ed9b96aa4333ea2c24c95 Mon Sep 17 00:00:00 2001 From: Emmanuel Gil Peyrot Date: Mon, 19 Sep 2011 22:41:46 +0200 Subject: Remove Elinks support and add a pure python XHTML/CSS parser. --- src/xhtml.py | 200 ++++++++++++++++++++++++++++++++++------------------------- 1 file changed, 117 insertions(+), 83 deletions(-) (limited to 'src') diff --git a/src/xhtml.py b/src/xhtml.py index f8272e4f..8f629e3b 100644 --- a/src/xhtml.py +++ b/src/xhtml.py @@ -38,51 +38,126 @@ def get_body_from_message_stanza(message): if config.get('enable_xhtml_im', 'true') == 'true': xhtml_body = message['xhtml_im'] if xhtml_body: - xhtml_body = convert_links_to_plaintext(xhtml_body) - try: - shell_body = xhtml_code_to_shell_colors(xhtml_body) - except OSError: - log.debug('html parsing failed') - else: - return shell_colors_to_poezio_colors(shell_body) + return xhtml_to_poezio_colors(xhtml_body) return message['body'] -def convert_links_to_plaintext(text): - """ - Replace - click - by - (click) - in plain text - """ + +def xhtml_to_poezio_colors(text): + def parse_css(css): + def get_color(string): + if value == 'black': + return 0 + if value == 'red': + return 1 + if value == 'green': + return 2 + if value == 'yellow': + return 3 + if value == 'blue': + return 4 + if value == 'magenta': + return 5 + if value == 'cyan': + return 6 + if value == 'white': + return 7 + if value == 'default': + return 8 + shell = '' + rules = css.split(';') + for rule in rules: + key, value = rule.split(':', 1) + key = key.strip() + value = value.strip() + log.debug(value) + if key == 'background-color': + pass#shell += '\x191' + elif key == 'color': + shell += '\x19%d' % get_color(value) + elif key == 'font-style': + shell += '\x19i' + elif key == 'font-weight': + shell += '\x19b' + elif key == 'margin-left': + shell += ' ' + elif key == 'text-align': + pass + elif key == 'text-decoration': + if value == 'underline': + shell += '\x19u' + elif value == 'blink': + shell += '\x19a' + return shell + log.debug(text) - xml = ElementTree(ET.fromstring(text)) - for parent in xml.getiterator(): - previous_child = None - for child in parent: - if child.tag == '{http://www.w3.org/1999/xhtml}a': - if child.attrib['href'] != child.text: - if child.text is None and 'title' in child.attrib: - link_text = '\n%s (%s)' % (child.attrib['href'], child.attrib['title']) - else: - link_text = '\n%s (%s)' % (child.attrib['href'], child.text) - else: - link_text = child.text - if previous_child is not None: - if previous_child.tail is None: - previous_child.tail = link_text - else: - previous_child.tail += link_text - else: - if parent.text is None: - parent.text = link_text - else: - parent.text += link_text - parent.remove(child) - previous_child = child - if version_info.minor <= 1: - return ET.tostring(xml.getroot()) - return ET.tostring(xml.getroot(), encoding=str) + xml = ET.fromstring(text) + message = '' + for elem in xml.iter(): + if elem.tag == '{http://www.w3.org/1999/xhtml}a': + if 'href' in elem.attrib and elem.attrib['href'] != elem.text: + message += '\x19u%s\x19o (%s)' % (elem.attrib['href'], elem.text) + else: + message += '\x19u' + elem.text + '\x19o' + elif elem.tag == '{http://www.w3.org/1999/xhtml}blockquote': + message += '“' + elif elem.tag == '{http://www.w3.org/1999/xhtml}body': + pass + elif elem.tag == '{http://www.w3.org/1999/xhtml}br': + message += '\n' + elif elem.tag == '{http://www.w3.org/1999/xhtml}cite': + message += '\x19u' + elif elem.tag == '{http://www.w3.org/1999/xhtml}em': + message += '\x19i' + elif elem.tag == '{http://www.w3.org/1999/xhtml}img' and 'src' in elem.attrib: + if elem.attrib['alt']: + message += '%s (%s)' % (elem.attrib['src'], elem.attrib['alt']) + else: + message += elem.attrib['src'] + elif elem.tag == '{http://www.w3.org/1999/xhtml}li': + pass + elif elem.tag == '{http://www.w3.org/1999/xhtml}ol': + pass + elif elem.tag == '{http://www.w3.org/1999/xhtml}p': + pass + elif elem.tag == '{http://www.w3.org/1999/xhtml}span': + pass + elif elem.tag == '{http://www.w3.org/1999/xhtml}strong': + message += '\x19b' + elif elem.tag == '{http://www.w3.org/1999/xhtml}ul': + pass + + if ('style' in elem.attrib and elem.tag != '{http://www.w3.org/1999/xhtml}br' + and elem.tag != '{http://www.w3.org/1999/xhtml}em' + and elem.tag != '{http://www.w3.org/1999/xhtml}strong'): + message += parse_css(elem.attrib['style']) + + if (elem.text and elem.tag != '{http://www.w3.org/1999/xhtml}a' + and elem.tag != '{http://www.w3.org/1999/xhtml}br' + and elem.tag != '{http://www.w3.org/1999/xhtml}img'): + message += elem.text + + if ('style' in elem.attrib and elem.tag != '{http://www.w3.org/1999/xhtml}br' + and elem.tag != '{http://www.w3.org/1999/xhtml}em' + and elem.tag != '{http://www.w3.org/1999/xhtml}strong'): + message += '\x19o' + + if elem.tag == '{http://www.w3.org/1999/xhtml}blockquote': + message += '”' + elif elem.tag == '{http://www.w3.org/1999/xhtml}cite': + message += '\x19o' + elif elem.tag == '{http://www.w3.org/1999/xhtml}em': + message += '\x19o' + elif elem.tag == '{http://www.w3.org/1999/xhtml}strong' or elem.tag == '{http://www.w3.org/1999/xhtml}b': + message += '\x19o' + elif elem.tag == '{http://www.w3.org/1999/xhtml}u': + message += '\x19o' + + if 'title' in elem.attrib: + message += ' [' + elem.attrib['title'] + ']' + + if elem.tail: + message += elem.tail + return message def clean_text(string): @@ -148,47 +223,6 @@ def poezio_colors_to_html(string): res += "

" return res.replace('\n', '
') -def shell_colors_to_poezio_colors(string): - """ - 'shell colors' means something like: - - Bonjour ^[[0;32msalut^[[0m - - The current understanding of this syntax is: - n = 0: reset all attributes to defaults - n = 1: activate bold - n >= 30 and n <= 37: set the foreground to n-30 - - """ - def repl(matchobj): - exp = matchobj.group(0)[2:-1] - numbers = [int(nb) for nb in exp.split(';')] - res = '' - for num in numbers: - if num == 0: - res += '\x19o' - elif num == 1: - res += '\x19b' - elif num >= 31 and num <= 37: - res += '\x19%d' % ((num-30)%7,) - return res - def remove_elinks_indent(string): - lines = string.split('\n') - for i, line in enumerate(lines): - lines[i] = re.sub(' ', '', line, 1) - return '\n'.join(lines) - res = shell_colors_re.sub(repl, string).strip() - res = remove_elinks_indent(res) - return res - -def xhtml_code_to_shell_colors(string): - """ - Use a console browser to parse the xhtml and - make it return a shell-colored string - """ - process = subprocess.Popen(["elinks", "-dump", "-dump-color-mode", "2"], stdout=subprocess.PIPE, stdin=subprocess.PIPE) - result = process.communicate(input=string.encode('utf-8'))[0] - return result.decode('utf-8').strip() def poezio_colors_to_xhtml(string): """ -- cgit v1.2.3