summaryrefslogtreecommitdiff
path: root/poezio/xhtml.py
diff options
context:
space:
mode:
author Emmanuel Gil Peyrot <linkmauve@linkmauve.fr> 2016-03-31 18:54:41 +0100
committer Emmanuel Gil Peyrot <linkmauve@linkmauve.fr> 2016-06-11 20:49:43 +0100
commit 332a5c2553db41de777473a1e1be9cd1522c9496 (patch)
tree 3ee06a59f147ccc4009b35cccfbe2461bcd18310 /poezio/xhtml.py
parent cf44cf7cdec9fdb35caa372563d57e7045dc29dd (diff)
downloadpoezio-332a5c2553db41de777473a1e1be9cd1522c9496.tar.gz
poezio-332a5c2553db41de777473a1e1be9cd1522c9496.tar.bz2
poezio-332a5c2553db41de777473a1e1be9cd1522c9496.tar.xz
poezio-332a5c2553db41de777473a1e1be9cd1522c9496.zip
Move the src directory to poezio, for better cython compatibility.
Diffstat (limited to 'poezio/xhtml.py')
-rw-r--r--poezio/xhtml.py543
1 files changed, 543 insertions, 0 deletions
diff --git a/poezio/xhtml.py b/poezio/xhtml.py
new file mode 100644
index 00000000..b84ce943
--- /dev/null
+++ b/poezio/xhtml.py
@@ -0,0 +1,543 @@
+# Copyright 2010-2011 Florent Le Coz <louiz@louiz.org>
+#
+# This file is part of Poezio.
+#
+# Poezio is free software: you can redistribute it and/or modify
+# it under the terms of the zlib license. See the COPYING file.
+
+"""
+Various methods to convert
+shell colors to poezio colors,
+xhtml code to shell colors,
+poezio colors to xhtml code
+"""
+
+import base64
+import curses
+import hashlib
+import re
+from os import path
+from slixmpp.xmlstream import ET
+from urllib.parse import unquote
+
+from io import BytesIO
+from xml import sax
+from xml.sax import saxutils
+
# Decimal digit characters, spelled out locally rather than taken
# from a module.
digits = '0123456789'

# Namespace URI used to recognise XHTML elements.
XHTML_NS = 'http://www.w3.org/1999/xhtml'
+
# HTML named colors, mapped to the terminal color index used for them.
# Entries are grouped by initial letter and kept in alphabetical order.
colors = {
    # a
    'aliceblue': 231, 'antiquewhite': 231, 'aqua': 51, 'aquamarine': 122,
    'azure': 231,
    # b
    'beige': 231, 'bisque': 230, 'black': 232, 'blanchedalmond': 230,
    'blue': 21, 'blueviolet': 135, 'brown': 124, 'burlywood': 223,
    # c
    'cadetblue': 109, 'chartreuse': 118, 'chocolate': 172, 'coral': 209,
    'cornflowerblue': 111, 'cornsilk': 231, 'crimson': 197, 'cyan': 51,
    # d
    'darkblue': 19, 'darkcyan': 37, 'darkgoldenrod': 178, 'darkgray': 247,
    'darkgreen': 28, 'darkgrey': 247, 'darkkhaki': 186, 'darkmagenta': 127,
    'darkolivegreen': 65, 'darkorange': 214, 'darkorchid': 134,
    'darkred': 124, 'darksalmon': 216, 'darkseagreen': 151,
    'darkslateblue': 61, 'darkslategray': 59, 'darkslategrey': 59,
    'darkturquoise': 44, 'darkviolet': 128, 'deeppink': 199,
    'deepskyblue': 45, 'dimgray': 241, 'dimgrey': 241, 'dodgerblue': 39,
    # f
    'firebrick': 160, 'floralwhite': 231, 'forestgreen': 34, 'fuchsia': 201,
    # g
    'gainsboro': 252, 'ghostwhite': 231, 'gold': 226, 'goldenrod': 214,
    'gray': 244, 'green': 34, 'greenyellow': 191, 'grey': 244,
    # h
    'honeydew': 231, 'hotpink': 212,
    # i
    'indianred': 174, 'indigo': 55, 'ivory': 231,
    # k
    'khaki': 229,
    # l
    'lavender': 231, 'lavenderblush': 231, 'lawngreen': 118,
    'lemonchiffon': 230, 'lightblue': 195, 'lightcoral': 217,
    'lightcyan': 231, 'lightgoldenrodyellow': 230, 'lightgray': 251,
    'lightgreen': 157, 'lightgrey': 251, 'lightpink': 224,
    'lightsalmon': 216, 'lightseagreen': 43, 'lightskyblue': 153,
    'lightslategray': 109, 'lightslategrey': 109, 'lightsteelblue': 189,
    'lightyellow': 231, 'lime': 46, 'limegreen': 77, 'linen': 231,
    # m
    'magenta': 201, 'maroon': 124, 'mediumaquamarine': 115,
    'mediumblue': 20, 'mediumorchid': 170, 'mediumpurple': 141,
    'mediumseagreen': 78, 'mediumslateblue': 105, 'mediumspringgreen': 49,
    'mediumturquoise': 80, 'mediumvioletred': 163, 'midnightblue': 18,
    'mintcream': 231, 'mistyrose': 231, 'moccasin': 230,
    # n
    'navajowhite': 230, 'navy': 19,
    # o
    'oldlace': 231, 'olive': 142, 'olivedrab': 106, 'orange': 214,
    'orangered': 202, 'orchid': 213,
    # p
    'palegoldenrod': 229, 'palegreen': 157, 'paleturquoise': 195,
    'palevioletred': 211, 'papayawhip': 231, 'peachpuff': 230, 'peru': 179,
    'pink': 224, 'plum': 219, 'powderblue': 195, 'purple': 127,
    # r
    'red': 196, 'rosybrown': 181, 'royalblue': 69,
    # s
    'saddlebrown': 130, 'salmon': 216, 'sandybrown': 216, 'seagreen': 72,
    'seashell': 231, 'sienna': 131, 'silver': 250, 'skyblue': 153,
    'slateblue': 104, 'slategray': 109, 'slategrey': 109, 'snow': 231,
    'springgreen': 48, 'steelblue': 74,
    # t
    'tan': 187, 'teal': 37, 'thistle': 225, 'tomato': 209, 'turquoise': 86,
    # v
    'violet': 219,
    # w
    'wheat': 230, 'white': 255, 'whitesmoke': 255,
    # y
    'yellow': 226, 'yellowgreen': 149,
}
+
# Any run of whitespace characters.
whitespace_re = re.compile(r'\s+')

# A full-format attribute: \x19, an optional '-', a digit and everything up
# to the next '}', or \x19 followed by one of the letters b, u, a, i, o.
xhtml_attr_re = re.compile(r'\x19-?\d[^}]*}|\x19[buaio]')
# A base64 image data URI; groups are (image subtype, payload).
xhtml_data_re = re.compile(r'data:image/([a-z]+);base64,(.+)')
# Two or more consecutive color attributes; group 1 is the last one.
poezio_color_double = re.compile(r'(?:\x19\d+}|\x19\d)+(\x19\d|\x19\d+})')
# One or more attributes immediately followed by the \x19o attribute.
poezio_format_trim = re.compile(r'(\x19\d+}|\x19\d|\x19[buaio]|\x19o)+\x19o')

# A simple-format color attribute: \x19 followed by a single digit.
xhtml_simple_attr_re = re.compile(r'\x19\d')
+
def get_body_from_message_stanza(message, use_xhtml=False,
                                 tmp_dir=None, extract_images=False):
    """
    Return the text to display for a message stanza.

    When use_xhtml is true and the stanza's html payload contains an
    XHTML body element, that body is converted to poezio colors with
    xhtml_to_poezio_colors() (tmp_dir and extract_images are passed
    through). If the conversion yields nothing, message['body'] is used
    instead, and a single space if that is empty too.

    In every other case message['body'] is returned unchanged.
    """
    if not use_xhtml:
        return message['body']

    xhtml = message['html'].xml
    xhtml_body = xhtml.find('{%s}body' % XHTML_NS)
    # Compare against None explicitly: an Element is falsy when it has no
    # child elements, so a body holding only text would otherwise be ignored.
    if xhtml_body is None:
        return message['body']

    content = xhtml_to_poezio_colors(xhtml_body, tmp_dir=tmp_dir,
                                     extract_images=extract_images)
    return content or message['body'] or " "
+
def ncurses_color_to_html(color):
    """
    Take a terminal color index and return a string of the form
    #XXXXXX representing an html color.

    - 0 to 15: the components are asked to curses (color_content), which
      reports each of them on a 0-1000 scale.
    - 16 to 231: the index is decoded as 16 + 36*r + 6*g + b with each
      component in 0-5, the same layout parse_css() uses to build indexes.
    - above 231: a gray ramp, the three components being equal.

    Each component is brought to a 0-6 scale and then formatted as an
    integer between 0 and 255.
    """
    if color <= 15:
        try:
            (r, g, b) = curses.color_content(color)
        except Exception:  # fallback in faulty terminals (e.g. xterm)
            (r, g, b) = curses.color_content(color % 8)
        # The small offset keeps a full-intensity component below 6, so
        # that it maps to 255 and not 256.
        r = r / 1000 * 6 - 0.01
        g = g / 1000 * 6 - 0.01
        b = b / 1000 * 6 - 0.01
    elif color <= 231:
        # Integer arithmetic only: red is the most significant digit of
        # the base-6 number, blue the least significant one.
        r, remainder = divmod(color - 16, 36)
        g, b = divmod(remainder, 6)
    else:
        r = g = b = (color - 232) / 24 * 6
    # %X only accepts integers, so truncate each component explicitly.
    return '#%02X%02X%02X' % tuple(int(component * 256 / 6)
                                   for component in (r, g, b))
+
def parse_css(css):
    """
    Convert an inline CSS declaration list (the content of a style
    attribute) into a string of poezio formatting attributes.

    Handled properties:
    - color: a #RGB / #RRGGBB value or a name from the colors table,
      emitted as a full-format color attribute
    - font-style: italic attribute (whatever the value is)
    - font-weight: bold attribute (whatever the value is)
    - margin-left: emitted as literal whitespace
    - text-decoration: underline or blink attribute

    background-color and text-align are recognised but ignored, as is
    every unknown property and every color that cannot be parsed.
    """
    def get_color(value):
        # Return the terminal color index for a CSS color, or -1.
        if not value:
            return -1
        if value[0] == '#':
            value = value[1:]
            length = len(value)
            if length != 3 and length != 6:
                return -1
            # int(x, 16) would accept signs, whitespace or a 0x prefix and
            # raise on anything else: only take plain hexadecimal digits.
            if any(char not in '0123456789abcdefABCDEF' for char in value):
                return -1
            value = int(value, 16)
            if length == 6:
                r = int(value >> 16)
                g = int((value >> 8) & 0xff)
                b = int(value & 0xff)
                if r == g == b:
                    # gray ramp, indexes 232 to 255
                    return 232 + int(r/10.6251)
                div = 42.51
            else:
                r = int(value >> 8)
                g = int((value >> 4) & 0xf)
                b = int(value & 0xf)
                if r == g == b:
                    return 232 + int(1.54*r)
                div = 2.51
            # 6x6x6 color cube, indexes 16 to 231
            return 6*6*int(r/div) + 6*int(g/div) + int(b/div) + 16
        # CSS color keywords are case-insensitive
        return colors.get(value.lower(), -1)

    shell = []
    for rule in css.split(';'):
        if ':' not in rule:
            continue
        key, value = rule.split(':', 1)
        key = key.strip()
        value = value.strip()
        if key == 'color':
            color = get_color(value)
            if color != -1:
                shell.append('\x19%d}' % color)
        elif key == 'font-style':
            shell.append('\x19i')
        elif key == 'font-weight':
            shell.append('\x19b')
        elif key == 'margin-left':
            # NOTE(review): shown as a single space here; the page this
            # was taken from may have collapsed a longer indent - confirm.
            shell.append(' ')
        elif key == 'text-decoration':
            if value == 'underline':
                shell.append('\x19u')
            elif value == 'blink':
                shell.append('\x19a')
        # background-color and text-align are deliberately ignored
    return ''.join(shell)
+
def trim(string):
    """
    Replace every run of whitespace in the string with one space.
    """
    return whitespace_re.sub(' ', string)
+
class XHTMLHandler(sax.ContentHandler):
    """
    SAX content handler converting an XHTML fragment into a string
    carrying poezio formatting attributes.

    Feed it through a namespace-aware SAX parser, then read the
    result property.
    """
    def __init__(self, force_ns=False, tmp_dir=None, extract_images=False):
        """
        force_ns: handle every element, whatever its namespace is.
        tmp_dir: directory where extracted images are written.
        extract_images: save base64 data-URI images into tmp_dir and
        show a file:// link to them rather than the URI itself.
        """
        super().__init__()
        # output fragments, joined by the result property
        self.builder = []
        # stack of the formatting attributes currently active
        self.formatting = []
        # stack of the attribute dicts of the elements still open
        self.attrs = []
        # stack of open lists: 'ul', or the next item number of an ol
        self.list_state = []
        self.is_pre = False
        # index in builder where the text of the current link starts
        self.a_start = 0
        # do not care about xhtml-in namespace
        self.force_ns = force_ns

        self.tmp_dir = tmp_dir
        self.extract_images = extract_images

    @property
    def result(self):
        """
        The converted text: stripped, with runs of consecutive colors
        reduced to the last one and attributes directly followed by a
        reset removed.
        """
        sanitized = re.sub(poezio_color_double, r'\1', ''.join(self.builder).strip())
        return re.sub(poezio_format_trim, '\x19o', sanitized)

    def append_formatting(self, formatting):
        """
        Activate a formatting string and write it to the output.
        """
        self.formatting.append(formatting)
        self.builder.append(formatting)

    def pop_formatting(self):
        """
        Drop the most recent formatting: reset everything, then write
        the formattings that are still active again.
        """
        self.formatting.pop()
        self.builder.append('\x19o' + ''.join(self.formatting))

    def characters(self, characters):
        """
        Output text, collapsing whitespace unless inside a pre element.
        """
        self.builder.append(characters if self.is_pre else trim(characters))

    def _save_image(self, type_, data):
        """
        Decode a base64 payload, store it in tmp_dir under a name made
        of its sha1 and type, and return the text to display for it.
        """
        try:
            bin_data = base64.b64decode(unquote(data))
        except ValueError as e:  # binascii.Error is a ValueError
            return '[Error while saving image: %s]' % e
        filename = hashlib.sha1(bin_data).hexdigest() + '.' + type_
        filepath = path.join(self.tmp_dir, filename)
        # same content gives the same name: do not write it twice
        if not path.exists(filepath):
            try:
                with open(filepath, 'wb') as fd:
                    fd.write(bin_data)
            except OSError as e:
                return '[Error while saving image: %s]' % e
        return 'file://%s' % filepath

    def _append_image(self, attrs):
        """
        Output an img element: its (possibly extracted) source, then
        its alternative text in parentheses.
        """
        builder = self.builder
        # an img without src is invalid but must not abort the parsing
        src = attrs.get('src', '')
        match = re.match(xhtml_data_re, src)
        if match and self.extract_images and self.tmp_dir:
            builder.append(self._save_image(match.group(1), match.group(2)))
        elif src:
            builder.append(trim(src))
        if 'alt' in attrs:
            builder.append(' (%s)' % trim(attrs['alt']))

    def startElementNS(self, name, _, attrs):
        """
        Handle an opening tag; name is a (namespace, localname) pair.
        """
        if not self.force_ns and name[0] != XHTML_NS:
            return

        builder = self.builder
        # only keep attributes without namespace, indexed by local name
        attrs = {key: value for ((ns, key), value) in attrs.items() if ns is None}
        self.attrs.append(attrs)

        if 'style' in attrs:
            style = parse_css(attrs['style'])
            self.append_formatting(style)

        name = name[1]
        if name == 'a':
            self.append_formatting('\x19u')
            self.a_start = len(self.builder)
        elif name == 'blockquote':
            builder.append('\u201c')  # left double quotation mark
        elif name == 'br':
            builder.append('\n')
        elif name == 'cite':
            self.append_formatting('\x19u')
        elif name == 'em':
            self.append_formatting('\x19i')
        elif name == 'img':
            self._append_image(attrs)
        elif name == 'ul':
            self.list_state.append('ul')
        elif name == 'ol':
            self.list_state.append(1)
        elif name == 'li':
            # an li outside of any list is shown as a bullet
            state = self.list_state[-1] if self.list_state else 'ul'
            if state == 'ul':
                builder.append('\n\u2022 ')  # bullet
            else:
                builder.append('\n%d) ' % state)
                self.list_state[-1] = state + 1
        elif name == 'p':
            builder.append('\n')
        elif name == 'pre':
            builder.append('\n')
            self.is_pre = True
        elif name == 'strong':
            self.append_formatting('\x19b')

    def endElementNS(self, name, _):
        """
        Handle a closing tag; name is a (namespace, localname) pair.
        """
        if not self.force_ns and name[0] != XHTML_NS:
            return

        builder = self.builder
        attrs = self.attrs.pop()
        name = name[1]

        if name == 'a':
            self.pop_formatting()
            # do not display the link twice
            text_elements = filter(lambda x: not x.startswith('\x19'),
                                   self.builder[self.a_start:])
            link_text = ''.join(text_elements).strip()
            if 'href' in attrs and attrs['href'] != link_text:
                builder.append(' (%s)' % trim(attrs['href']))
        elif name == 'blockquote':
            builder.append('\u201d')  # right double quotation mark
        elif name in ('cite', 'em', 'strong'):
            self.pop_formatting()
        elif name in ('ol', 'ul'):
            # leave the list, so that an enclosing list gets its own
            # bullet or numbering back
            if self.list_state:
                self.list_state.pop()
            builder.append('\n')
        elif name == 'p':
            builder.append('\n')
        elif name == 'pre':
            builder.append('\n')
            self.is_pre = False

        if 'style' in attrs:
            self.pop_formatting()

        if 'title' in attrs:
            builder.append(' [' + attrs['title'] + ']')
+
def xhtml_to_poezio_colors(xml, force=False, tmp_dir=None, extract_images=None):
    """
    Convert an XHTML fragment to a string with poezio formatting
    attributes.

    xml may be a str, bytes, or an element that ET.tostring() can
    serialize. force makes the handler treat every element regardless
    of its namespace; tmp_dir and extract_images are handed over to
    XHTMLHandler.

    Parsing errors raised by the SAX parser are not caught here.
    """
    if isinstance(xml, str):
        xml = xml.encode('utf8')
    elif not isinstance(xml, bytes):
        xml = ET.tostring(xml)

    handler = XHTMLHandler(force_ns=force, tmp_dir=tmp_dir,
                           extract_images=extract_images)
    parser = sax.make_parser()
    parser.setFeature(sax.handler.feature_namespaces, True)
    # The markup may come from a remote contact: never let the parser
    # fetch external entities, whatever the interpreter's default is.
    parser.setFeature(sax.handler.feature_external_ges, False)
    parser.setContentHandler(handler)
    parser.parse(BytesIO(xml))
    return handler.result
+
def clean_text(s):
    """
    Return the string without its full-format attributes, i.e. without
    anything matched by xhtml_attr_re (colors ending with a closing
    curly bracket, and the single-letter attributes).
    """
    return xhtml_attr_re.sub("", s)
+
def clean_text_simple(string):
    """
    Return the string without its simple-format attributes: each
    attribute marker is removed together with the one character
    that follows it.
    """
    kept = []
    rest = string
    while True:
        head, marker, tail = rest.partition('\x19')
        kept.append(head)
        if not marker:
            break
        # skip the character attached to the marker, if there is one
        rest = tail[1:]
    return ''.join(kept)
+
def convert_simple_to_full_colors(text):
    """
    Translate the control characters 0x0E to 0x19 into their
    marker-prefixed attribute, then add a closing curly bracket
    after every marker that is followed by a digit.
    """
    # TODO, have a single list of this. This is some sort of
    # duplicate from windows.format_chars
    table = {'\x0E': '\x19b', '\x0F': '\x19o', '\x10': '\x19u'}
    # 0x11 to 0x19 stand for the colors 1 to 9
    for number in range(1, 10):
        table[chr(0x10 + number)] = '\x19%d' % number
    translated = text.translate(str.maketrans(table))
    return xhtml_simple_attr_re.sub(lambda match: match.group(0) + '}',
                                    translated)
+
# Color names used in generated html for the color numbers 1 to 7.
number_to_color_names = {
    1: 'red', 2: 'green', 3: 'yellow', 4: 'blue',
    5: 'violet', 6: 'turquoise', 7: 'white',
}
+
def format_inline_css(_dict):
    """
    Serialize a mapping of css properties to values into a string of
    'key: value;' declarations, in the iteration order of the mapping.
    """
    declarations = ['%s: %s;' % pair for pair in _dict.items()]
    return ''.join(declarations)
+
def poezio_colors_to_html(string):
    """
    Convert a string with poezio formatting attributes to an XHTML
    body element (a colored run becomes a span with a style attribute).

    Handled attributes: o (reset), b (bold), u (underline), i (italic)
    and colors, written as digits optionally closed by a curly
    bracket. Any other attribute is dropped, as is a marker ending
    the string. Text is escaped and newlines become br elements.
    """
    # Maintain a list of the current css attributes used
    # And check if a tag is open (by design, we only open
    # spans tag, and they cannot be nested.
    current_attrs = {}
    tag_open = False
    build = ["<body xmlns='http://www.w3.org/1999/xhtml'><p>"]

    def check_property(key, value):
        # Record a css property; close the open span when it changes.
        nonlocal tag_open
        if current_attrs.get(key, None) == value:
            return
        current_attrs[key] = value
        if tag_open:
            tag_open = False
            build.append('</span>')

    def append_text(text):
        # Output escaped text, opening a span first if a style is pending.
        nonlocal tag_open
        if not text:
            return
        if current_attrs and not tag_open:
            build.append('<span style="%s">' % format_inline_css(current_attrs))
            tag_open = True
        build.append(saxutils.escape(text))

    length = len(string)
    pos = 0
    # Walk the string with an index rather than re-slicing it after
    # each attribute.
    while True:
        marker = string.find('\x19', pos)
        if marker == -1:
            append_text(string[pos:])
            break
        append_text(string[pos:marker])
        if marker + 1 >= length:
            # marker without attribute at the very end: nothing to apply
            break
        attr_char = string[marker + 1].lower()
        pos = marker + 2

        if attr_char == 'o':
            if tag_open:
                build.append('</span>')
                tag_open = False
            current_attrs.clear()
        elif attr_char == 'b':
            check_property('font-weight', 'bold')
        elif attr_char == 'u':
            check_property('text-decoration', 'underline')
        elif attr_char == 'i':
            check_property('font-style', 'italic')
        elif attr_char in digits:
            # read the whole number, then the bracket only if it is there
            end = marker + 2
            while end < length and string[end] in digits:
                end += 1
            number = int(string[marker + 1:end])
            if number in number_to_color_names:
                check_property('color', number_to_color_names[number])
            else:
                check_property('color', ncurses_color_to_html(number))
            pos = end + 1 if string[end:end + 1] == '}' else end

    if tag_open:
        build.append('</span>')
    build.append("</p></body>")
    text = ''.join(build)
    return text.replace('\n', '<br />')