From a9f642f7438fe4489cdb9cc5ac59c929054656c8 Mon Sep 17 00:00:00 2001 From: mathieui Date: Thu, 16 Oct 2014 18:49:32 +0200 Subject: Extract XHTML-IM inline images by default - Add two new options: tmp_image_dir and extract_inline_images - tmp_image_dir is $XDG_CACHE_HOME(usually ~/.cache)/poezio/images if unset - Name the images from a SHA-1 of their data and their mimetype - Output file:// links inside the message --- src/xhtml.py | 40 +++++++++++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 7 deletions(-) (limited to 'src/xhtml.py') diff --git a/src/xhtml.py b/src/xhtml.py index 48664311..69519f8d 100644 --- a/src/xhtml.py +++ b/src/xhtml.py @@ -12,9 +12,13 @@ xhtml code to shell colors, poezio colors to xhtml code """ -import re +import base64 import curses +import hashlib +import re +from os import path from sleekxmpp.xmlstream import ET +from urllib.parse import unquote from io import BytesIO from xml import sax @@ -178,10 +182,12 @@ colors = { whitespace_re = re.compile(r'\s+') xhtml_attr_re = re.compile(r'\x19-?\d[^}]*}|\x19[buaio]') +xhtml_data_re = re.compile(r'data:image/([a-z]+);base64,(.+)') xhtml_simple_attr_re = re.compile(r'\x19\d') -def get_body_from_message_stanza(message, use_xhtml=False): +def get_body_from_message_stanza(message, use_xhtml=False, + tmp_dir=None, extract_images=False): """ Returns a string with xhtml markups converted to poezio colors if there's an xhtml_im element, or @@ -191,7 +197,8 @@ def get_body_from_message_stanza(message, use_xhtml=False): xhtml = message['html'].xml xhtml_body = xhtml.find('{http://www.w3.org/1999/xhtml}body') if xhtml_body: - content = xhtml_to_poezio_colors(xhtml_body) + content = xhtml_to_poezio_colors(xhtml_body, tmp_dir=tmp_dir, + extract_images=extract_images) content = content if content else message['body'] return content or " " return message['body'] @@ -281,7 +288,7 @@ def trim(string): return re.sub(whitespace_re, ' ', string) class XHTMLHandler(sax.ContentHandler): - def
__init__(self, force_ns=False): + def __init__(self, force_ns=False, tmp_dir=None, extract_images=False): self.builder = [] self.formatting = [] self.attrs = [] @@ -291,6 +298,9 @@ class XHTMLHandler(sax.ContentHandler): # do not care about xhtml-in namespace self.force_ns = force_ns + self.tmp_dir = tmp_dir + self.extract_images = extract_images + @property def result(self): return ''.join(self.builder).strip() @@ -331,7 +341,22 @@ class XHTMLHandler(sax.ContentHandler): elif name == 'em': self.append_formatting('\x19i') elif name == 'img': - builder.append(trim(attrs['src'])) + if re.match(xhtml_data_re, attrs['src']) and self.extract_images: + type_, data = [i for i in re.split(xhtml_data_re, attrs['src']) if i] + bin_data = base64.b64decode(unquote(data)) + filename = hashlib.sha1(bin_data).hexdigest() + '.' + type_ + filepath = path.join(self.tmp_dir, filename) + if not path.exists(filepath): + try: + with open(filepath, 'wb') as fd: + fd.write(bin_data) + builder.append('file://%s' % filepath) + except Exception as e: + builder.append('[Error while saving image: %s]' % e) + else: + builder.append('file://%s' % filepath) + else: + builder.append(trim(attrs['src'])) if 'alt' in attrs: builder.append(' (%s)' % trim(attrs['alt'])) elif name == 'ul': @@ -389,13 +414,14 @@ class XHTMLHandler(sax.ContentHandler): if 'title' in attrs: builder.append(' [' + attrs['title'] + ']') -def xhtml_to_poezio_colors(xml, force=False): +def xhtml_to_poezio_colors(xml, force=False, tmp_dir=None, extract_images=None): if isinstance(xml, str): xml = xml.encode('utf8') elif not isinstance(xml, bytes): xml = ET.tostring(xml) - handler = XHTMLHandler(force_ns=force) + handler = XHTMLHandler(force_ns=force, tmp_dir=tmp_dir, + extract_images=extract_images) parser = sax.make_parser() parser.setFeature(sax.handler.feature_namespaces, True) parser.setContentHandler(handler) -- cgit v1.2.3