# Copyright 2010-2011 Florent Le Coz
#
# This file is part of Poezio.
#
# Poezio is free software: you can redistribute it and/or modify
# it under the terms of the zlib license. See the COPYING file.

"""
Various methods to convert
shell colors to poezio colors,
xhtml code to shell colors,
poezio colors to xhtml code
"""

import base64
import curses
import hashlib
import re
from os import path
from urllib.parse import unquote

from io import BytesIO
from xml import sax
from xml.sax import saxutils

try:
    from slixmpp.xmlstream import ET
except ImportError:
    # Only ET.tostring() is needed in this module, which the standard
    # library ElementTree provides as well; this keeps the converters
    # importable when slixmpp is not installed.
    from xml.etree import ElementTree as ET

digits = '0123456789' # never trust the modules

XHTML_NS = 'http://www.w3.org/1999/xhtml'

# HTML named colors
colors = {
    'aliceblue': 231,
    'antiquewhite': 231,
    'aqua': 51,
    'aquamarine': 122,
    'azure': 231,
    'beige': 231,
    'bisque': 230,
    'black': 232,
    'blanchedalmond': 230,
    'blue': 21,
    'blueviolet': 135,
    'brown': 124,
    'burlywood': 223,
    'cadetblue': 109,
    'chartreuse': 118,
    'chocolate': 172,
    'coral': 209,
    'cornflowerblue': 111,
    'cornsilk': 231,
    'crimson': 197,
    'cyan': 51,
    'darkblue': 19,
    'darkcyan': 37,
    'darkgoldenrod': 178,
    'darkgray': 247,
    'darkgreen': 28,
    'darkgrey': 247,
    'darkkhaki': 186,
    'darkmagenta': 127,
    'darkolivegreen': 65,
    'darkorange': 214,
    'darkorchid': 134,
    'darkred': 124,
    'darksalmon': 216,
    'darkseagreen': 151,
    'darkslateblue': 61,
    'darkslategray': 59,
    'darkslategrey': 59,
    'darkturquoise': 44,
    'darkviolet': 128,
    'deeppink': 199,
    'deepskyblue': 45,
    'dimgray': 241,
    'dimgrey': 241,
    'dodgerblue': 39,
    'firebrick': 160,
    'floralwhite': 231,
    'forestgreen': 34,
    'fuchsia': 201,
    'gainsboro': 252,
    'ghostwhite': 231,
    'gold': 226,
    'goldenrod': 214,
    'gray': 244,
    'green': 34,
    'greenyellow': 191,
    'grey': 244,
    'honeydew': 231,
    'hotpink': 212,
    'indianred': 174,
    'indigo': 55,
    'ivory': 231,
    'khaki': 229,
    'lavender': 231,
    'lavenderblush': 231,
    'lawngreen': 118,
    'lemonchiffon': 230,
    'lightblue': 195,
    'lightcoral': 217,
    'lightcyan': 231,
    'lightgoldenrodyellow': 230,
    'lightgray': 251,
    'lightgreen': 157,
    'lightgrey': 251,
    'lightpink': 224,
    'lightsalmon': 216,
    'lightseagreen': 43,
    'lightskyblue': 153,
    'lightslategray': 109,
    'lightslategrey': 109,
    'lightsteelblue': 189,
    'lightyellow': 231,
    'lime': 46,
    'limegreen': 77,
    'linen': 231,
    'magenta': 201,
    'maroon': 124,
    'mediumaquamarine': 115,
    'mediumblue': 20,
    'mediumorchid': 170,
    'mediumpurple': 141,
    'mediumseagreen': 78,
    'mediumslateblue': 105,
    'mediumspringgreen': 49,
    'mediumturquoise': 80,
    'mediumvioletred': 163,
    'midnightblue': 18,
    'mintcream': 231,
    'mistyrose': 231,
    'moccasin': 230,
    'navajowhite': 230,
    'navy': 19,
    'oldlace': 231,
    'olive': 142,
    'olivedrab': 106,
    'orange': 214,
    'orangered': 202,
    'orchid': 213,
    'palegoldenrod': 229,
    'palegreen': 157,
    'paleturquoise': 195,
    'palevioletred': 211,
    'papayawhip': 231,
    'peachpuff': 230,
    'peru': 179,
    'pink': 224,
    'plum': 219,
    'powderblue': 195,
    'purple': 127,
    'red': 196,
    'rosybrown': 181,
    'royalblue': 69,
    'saddlebrown': 130,
    'salmon': 216,
    'sandybrown': 216,
    'seagreen': 72,
    'seashell': 231,
    'sienna': 131,
    'silver': 250,
    'skyblue': 153,
    'slateblue': 104,
    'slategray': 109,
    'slategrey': 109,
    'snow': 231,
    'springgreen': 48,
    'steelblue': 74,
    'tan': 187,
    'teal': 37,
    'thistle': 225,
    'tomato': 209,
    'turquoise': 86,
    'violet': 219,
    'wheat': 230,
    'white': 255,
    'whitesmoke': 255,
    'yellow': 226,
    'yellowgreen': 149
}

whitespace_re = re.compile(r'\s+')

xhtml_attr_re = re.compile(r'\x19-?\d[^}]*}|\x19[buaio]')
xhtml_data_re = re.compile(r'data:image/([a-z]+);base64,(.+)')
poezio_color_double = re.compile(r'(?:\x19\d+}|\x19\d)+(\x19\d|\x19\d+})')
poezio_format_trim = re.compile(r'(\x19\d+}|\x19\d|\x19[buaio]|\x19o)+\x19o')

xhtml_simple_attr_re = re.compile(r'\x19\d')

# "#rgb" or "#rrggbb", nothing else (int(x, 16) alone would also accept
# signs, "0x" prefixes and underscores).
_hex_color_re = re.compile(r'#([0-9a-fA-F]{3}|[0-9a-fA-F]{6})')
# A 0x19 marker together with the single character that follows it, if any.
_simple_attr_strip_re = re.compile(r'\x19.?', re.DOTALL)
_leading_digits_re = re.compile(r'\d+')

# TODO, have a single list of this. This is some sort of
# duplicate from windows.format_chars
_SIMPLE_FORMAT_TABLE = str.maketrans({
    '\x0E': '\x19b', '\x0F': '\x19o', '\x10': '\x19u',
    '\x11': '\x191', '\x12': '\x192', '\x13': '\x193',
    '\x14': '\x194', '\x15': '\x195', '\x16': '\x196',
    '\x17': '\x197', '\x18': '\x198', '\x19': '\x199'})


def get_body_from_message_stanza(message, use_xhtml=False,
                                 tmp_dir=None, extract_images=False):
    """
    Returns a string with xhtml markups converted to
    poezio colors if there's an xhtml_im element, or
    the body (without any color) otherwise

    ``message`` is indexed with 'html' (whose ``xml`` attribute is
    searched for the XHTML body element) and with 'body'.  ``tmp_dir``
    and ``extract_images`` are passed on to xhtml_to_poezio_colors().
    """
    if use_xhtml:
        xhtml = message['html'].xml
        xhtml_body = xhtml.find('{%s}body' % XHTML_NS)
        # Compare with None: an element holding only text has no children
        # and is therefore falsy, which used to discard valid markup.
        if xhtml_body is not None:
            content = xhtml_to_poezio_colors(xhtml_body, tmp_dir=tmp_dir,
                                             extract_images=extract_images)
            content = content if content else message['body']
            return content or " "
    return message['body']


def ncurses_color_to_html(color):
    """
    Takes an int between 0 and 256 and returns
    a string of the form #XXXXXX representing an
    html color.

    Colors up to 15 are looked up with curses.color_content(), so they
    need an initialised curses; 16-231 are decoded as a 6x6x6 cube laid
    out as 16 + 36*red + 6*green + blue (the layout parse_css() produces);
    anything above is treated as a grey ramp.
    """
    if color <= 15:
        try:
            channels = curses.color_content(color)
        except curses.error: # fallback in faulty terminals (e.g. xterm)
            channels = curses.color_content(color % 8)
        # curses reports each channel on a 0-1000 scale
        red, green, blue = (value * 255 // 1000 for value in channels)
    elif color <= 231:
        cube = color - 16
        # integer arithmetic only: '%02X' rejects floats
        red = (cube // 36) * 51
        green = ((cube // 6) % 6) * 51
        blue = (cube % 6) * 51
    else:
        red = green = blue = (color - 232) * 255 // 24
    return '#%02X%02X%02X' % (red, green, blue)


def _css_color_to_index(value):
    """
    Return the terminal color index for a CSS color value ("#rgb",
    "#rrggbb" or an HTML color name), or -1 when it is not recognised.
    """
    match = _hex_color_re.fullmatch(value)
    if match is None:
        # color names are case-insensitive; unknown or malformed -> -1
        return colors.get(value.lower(), -1)
    hexa = match.group(1)
    number = int(hexa, 16)
    if len(hexa) == 6:
        red = number >> 16
        green = (number >> 8) & 0xff
        blue = number & 0xff
        if red == green == blue:
            return 232 + int(red / 10.6251)
        div = 42.51
    else:
        red = number >> 8
        green = (number >> 4) & 0xf
        blue = number & 0xf
        if red == green == blue:
            return 232 + int(1.54 * red)
        div = 2.51
    return 6*6*int(red/div) + 6*int(green/div) + int(blue/div) + 16


def parse_css(css):
    """
    Convert an inline CSS declaration list ("key: value; key: value")
    into the matching poezio formatting string.

    Declarations without a colon, unknown properties and unrecognised
    color values are ignored.
    """
    shell = []
    for rule in css.split(';'):
        if ':' not in rule:
            continue
        key, value = rule.split(':', 1)
        key = key.strip()
        value = value.strip()
        if key == 'color':
            color = _css_color_to_index(value)
            if color != -1:
                shell.append('\x19%d}' % color)
        elif key == 'font-style':
            shell.append('\x19i')
        elif key == 'font-weight':
            shell.append('\x19b')
        elif key == 'margin-left':
            shell.append(' ')
        elif key == 'text-decoration':
            if value == 'underline':
                shell.append('\x19u')
            elif value == 'blink':
                shell.append('\x19a')
        # 'background-color' and 'text-align' are deliberately ignored
    return ''.join(shell)


def trim(string):
    """Replace every run of whitespace in ``string`` with one space."""
    return whitespace_re.sub(' ', string)


class XHTMLHandler(sax.ContentHandler):
    """
    SAX content handler turning XHTML events into a poezio-formatted
    string, available through the ``result`` property once parsing is done.

    Elements outside the XHTML namespace are skipped unless ``force_ns``
    is set.  When ``extract_images`` is true and ``tmp_dir`` is given,
    images embedded as base64 "data:" URIs are written to ``tmp_dir`` and
    replaced by a file:// link.
    """

    def __init__(self, force_ns=False, tmp_dir=None, extract_images=False):
        super().__init__()
        self.builder = []       # output fragments, joined by ``result``
        self.formatting = []    # formatting codes currently in effect
        self.attrs = []         # attribute dicts of the open elements
        self.list_state = []    # per open list: 'ul' or the next number
        self.is_pre = False
        self.a_start = 0        # index in builder where the link text starts
        # do not care about xhtml-in namespace
        self.force_ns = force_ns

        self.tmp_dir = tmp_dir
        self.extract_images = extract_images

    @property
    def result(self):
        """The converted text, with redundant formatting codes removed."""
        sanitized = poezio_color_double.sub(r'\1',
                                            ''.join(self.builder).strip())
        return poezio_format_trim.sub('\x19o', sanitized)

    def append_formatting(self, formatting):
        """Emit ``formatting`` and remember it as active."""
        self.formatting.append(formatting)
        self.builder.append(formatting)

    def pop_formatting(self):
        """Drop the latest formatting: reset, then re-emit what remains."""
        self.formatting.pop()
        self.builder.append('\x19o' + ''.join(self.formatting))

    def characters(self, characters):
        """Add text, collapsing whitespace except inside <pre>."""
        self.builder.append(characters if self.is_pre else trim(characters))

    def _image_text(self, src):
        """
        Return what to display for an <img> whose source is ``src``: a
        file:// link when the embedded data could be saved, an error note
        when it could not, or the source itself otherwise.
        """
        match = xhtml_data_re.match(src)
        if not (match and self.extract_images and self.tmp_dir is not None):
            return trim(src)
        type_, data = match.groups()
        try:
            # b64decode reports bad data with a ValueError subclass
            bin_data = base64.b64decode(unquote(data))
            filename = hashlib.sha1(bin_data).hexdigest() + '.' + type_
            filepath = path.join(self.tmp_dir, filename)
            if not path.exists(filepath):
                with open(filepath, 'wb') as fd:
                    fd.write(bin_data)
        except (OSError, ValueError) as e:
            return '[Error while saving image: %s]' % e
        return 'file://%s' % filepath

    def startElementNS(self, name, _, attrs):
        if name[0] != XHTML_NS and not self.force_ns:
            return

        builder = self.builder
        # keep only the attributes that are not namespaced
        attrs = {key: value for ((ns, key), value) in attrs.items()
                 if ns is None}
        self.attrs.append(attrs)

        if 'style' in attrs:
            self.append_formatting(parse_css(attrs['style']))

        tag = name[1]
        if tag == 'a':
            self.append_formatting('\x19u')
            self.a_start = len(self.builder)
        elif tag == 'blockquote':
            builder.append('\u201c')  # left double quotation mark
        elif tag == 'br':
            builder.append('\n')
        elif tag == 'cite':
            self.append_formatting('\x19u')
        elif tag == 'em':
            self.append_formatting('\x19i')
        elif tag == 'img':
            # a missing src is displayed as an empty source, not an error
            builder.append(self._image_text(attrs.get('src', '')))
            if 'alt' in attrs:
                builder.append(' (%s)' % trim(attrs['alt']))
        elif tag == 'ul':
            self.list_state.append('ul')
        elif tag == 'ol':
            self.list_state.append(1)
        elif tag == 'li':
            # an <li> outside of any list is shown as a bullet
            state = self.list_state[-1] if self.list_state else 'ul'
            if state == 'ul':
                builder.append('\n\u2022 ')  # bullet
            else:
                builder.append('\n%d) ' % state)
                self.list_state[-1] = state + 1
        elif tag == 'p':
            builder.append('\n')
        elif tag == 'pre':
            builder.append('\n')
            self.is_pre = True
        elif tag == 'strong':
            self.append_formatting('\x19b')

    def endElementNS(self, name, _):
        if name[0] != XHTML_NS and not self.force_ns:
            return

        builder = self.builder
        attrs = self.attrs.pop()
        tag = name[1]

        if tag == 'a':
            self.pop_formatting()
            # do not display the link twice
            link_text = ''.join(piece for piece in builder[self.a_start:]
                                if not piece.startswith('\x19')).strip()
            if 'href' in attrs and attrs['href'] != link_text:
                builder.append(' (%s)' % trim(attrs['href']))
        elif tag == 'blockquote':
            builder.append('\u201d')  # right double quotation mark
        elif tag in ('cite', 'em', 'strong'):
            self.pop_formatting()
        elif tag in ('ol', 'p', 'ul'):
            if tag != 'p' and self.list_state:
                # leave the list, so that the items of an enclosing list
                # get their own bullet or numbering back
                self.list_state.pop()
            builder.append('\n')
        elif tag == 'pre':
            builder.append('\n')
            self.is_pre = False

        if 'style' in attrs:
            self.pop_formatting()

        if 'title' in attrs:
            builder.append(' [' + attrs['title'] + ']')


def xhtml_to_poezio_colors(xml, force=False, tmp_dir=None, extract_images=None):
    """
    Convert XHTML into a poezio-formatted string.

    ``xml`` may be a str, bytes, or an element that ET.tostring() can
    serialise.  ``force`` makes the handler accept elements from any
    namespace; ``tmp_dir`` and ``extract_images`` control the extraction
    of embedded images (see XHTMLHandler).  Errors raised by the SAX
    parser on malformed input are not caught here.
    """
    if isinstance(xml, str):
        xml = xml.encode('utf8')
    elif not isinstance(xml, bytes):
        xml = ET.tostring(xml)

    handler = XHTMLHandler(force_ns=force, tmp_dir=tmp_dir,
                           extract_images=extract_images)
    parser = sax.make_parser()
    parser.setFeature(sax.handler.feature_namespaces, True)
    # The markup is message content, i.e. untrusted: never let the parser
    # fetch external general entities.
    parser.setFeature(sax.handler.feature_external_ges, False)
    parser.setContentHandler(handler)
    parser.parse(BytesIO(xml))
    return handler.result


def clean_text(s):
    """
    Remove all xhtml-im attributes (0x19 followed by a format) from the
    string with the complete color format, i.e. 0x19 + "xxx}"
    """
    return xhtml_attr_re.sub("", s)


def clean_text_simple(string):
    """
    Remove every 0x19 marker, along with the character following it,
    from a string formatted with simple colors (0x19 + one character)
    """
    # one left-to-right pass; same result as repeatedly cutting the first
    # marker out of the string
    return _simple_attr_strip_re.sub('', string)


def convert_simple_to_full_colors(text):
    """
    Takes a string using the single control characters 0x0E to 0x19 as
    formatting codes and returns it in the full format: 0x19 followed by
    a letter, or by a digit and a closing curly bracket.
    """
    text = text.translate(_SIMPLE_FORMAT_TABLE)
    return xhtml_simple_attr_re.sub(lambda match: match.group(0) + '}', text)


number_to_color_names = {
    1: 'red',
    2: 'green',
    3: 'yellow',
    4: 'blue',
    5: 'violet',
    6: 'turquoise',
    7: 'white'
}


def format_inline_css(_dict):
    """Serialise a dict of CSS properties as "key: value;" pairs."""
    return ''.join('%s: %s;' % (key, value) for key, value in _dict.items())


def poezio_colors_to_html(string):
    """
    Convert poezio colors to html
    (e.g. 0x19 + "1}" becomes a span styled with "color: red;")

    The text is XML-escaped, wrapped in an XHTML body/paragraph, and
    newlines become line breaks.  A 0x19 marker with nothing after it is
    dropped.
    """
    # Maintain a list of the current css attributes used
    # And check if a tag is open (by design, we only open
    # spans tag, and they cannot be nested.
    current_attrs = {}
    tag_open = False
    build = ["<body xmlns='http://www.w3.org/1999/xhtml'><p>"]

    def check_property(key, value):
        # a changed property closes the span; it is reopened lazily
        nonlocal tag_open
        if current_attrs.get(key) == value:
            return
        current_attrs[key] = value
        if tag_open:
            tag_open = False
            build.append('</span>')

    def append_text(text):
        nonlocal tag_open
        if current_attrs and not tag_open:
            build.append('<span style="%s">'
                         % format_inline_css(current_attrs))
            tag_open = True
        build.append(saxutils.escape(text))

    next_attr_char = string.find('\x19')
    while next_attr_char != -1:
        if next_attr_char != 0:
            append_text(string[:next_attr_char])

        attr_char = string[next_attr_char+1:next_attr_char+2].lower()
        if not attr_char:
            # marker at the very end of the string: nothing to apply
            string = ''
            break

        if attr_char == 'o':
            if tag_open:
                build.append('</span>')
                tag_open = False
            current_attrs.clear()
        elif attr_char == 'b':
            check_property('font-weight', 'bold')
        elif attr_char == 'u':
            check_property('text-decoration', 'underline')

        end = string.find('}', next_attr_char) if attr_char in digits else -1
        if end != -1:
            # only the leading number is used; anything else before the
            # closing bracket (e.g. after a comma) is skipped
            number_str = string[next_attr_char+1:end]
            number = int(_leading_digits_re.match(number_str).group(0))
            if number in number_to_color_names:
                check_property('color', number_to_color_names[number])
            else:
                check_property('color', ncurses_color_to_html(number))
            string = string[end+1:]
        else:
            # a letter code, or a digit without closing bracket
            string = string[next_attr_char+2:]
        next_attr_char = string.find('\x19')

    if string:
        append_text(string)
    if tag_open:
        build.append('</span>')
    build.append("</p></body>")
    text = ''.join(build)
    return text.replace('\n', '<br />')