Extract XHTML-IM inline imags by default

- Add two new options: tmp_image_dir and extract_inline_images - tmp_image_dir is $XDG_CACHE_HOME(usually ~/.cache)/poezio/images if unset - Name the images from a SHA-1 of their data and their mimetype - Output file:// links inside the message
author: mathieui <mathieui@mathieui.net> 2014-10-16 18:49:32 +0200
committer: mathieui <mathieui@mathieui.net> 2014-10-16 18:56:12 +0200
commit: a9f642f7438fe4489cdb9cc5ac59c929054656c8 (patch)
tree: 5ce6cba2105f178f92235fa325725941a5c9924c /src/xhtml.py
parent: d4590949f7b691e3e1d6eff8fa339e62a44bae51 (diff)
download: poezio-a9f642f7438fe4489cdb9cc5ac59c929054656c8.tar.gz
poezio-a9f642f7438fe4489cdb9cc5ac59c929054656c8.tar.bz2
poezio-a9f642f7438fe4489cdb9cc5ac59c929054656c8.tar.xz
poezio-a9f642f7438fe4489cdb9cc5ac59c929054656c8.zip
1 files changed, 33 insertions, 7 deletions
diff --git a/src/xhtml.py b/src/xhtml.py
index 48664311..69519f8d 100644
--- a/src/xhtml.py
+++ b/src/xhtml.py
@@ -12,9 +12,13 @@ xhtml code to shell colors,
 poezio colors to xhtml code
 """
 
-import re
+import base64
 import curses
+import hashlib
+import re
+from os import path
 from sleekxmpp.xmlstream import ET
+from urllib.parse import unquote
 
 from io import BytesIO
 from xml import sax
@@ -178,10 +182,12 @@ colors = {
 whitespace_re = re.compile(r'\s+')
 
 xhtml_attr_re = re.compile(r'\x19-?\d[^}]*}|\x19[buaio]')
+xhtml_data_re = re.compile(r'data:image/([a-z]+);base64,(.+)')
 
 xhtml_simple_attr_re = re.compile(r'\x19\d')
 
-def get_body_from_message_stanza(message, use_xhtml=False):
+def get_body_from_message_stanza(message, use_xhtml=False,
+                                 tmp_dir=None, extract_images=False):
     """
     Returns a string with xhtml markups converted to
     poezio colors if there's an xhtml_im element, or
@@ -191,7 +197,8 @@ def get_body_from_message_stanza(message, use_xhtml=False):
         xhtml = message['html'].xml
         xhtml_body = xhtml.find('{http://www.w3.org/1999/xhtml}body')
         if xhtml_body:
-            content = xhtml_to_poezio_colors(xhtml_body)
+            content = xhtml_to_poezio_colors(xhtml_body, tmp_dir=tmp_dir,
+                                             extract_images=extract_images)
             content = content if content else message['body']
             return content or " "
     return message['body']
@@ -281,7 +288,7 @@ def trim(string):
     return re.sub(whitespace_re, ' ', string)
 
 class XHTMLHandler(sax.ContentHandler):
-    def __init__(self, force_ns=False):
+    def __init__(self, force_ns=False, tmp_dir=None, extract_images=False):
         self.builder = []
         self.formatting = []
         self.attrs = []
@@ -291,6 +298,9 @@ class XHTMLHandler(sax.ContentHandler):
         # do not care about xhtml-in namespace
         self.force_ns = force_ns
 
+        self.tmp_dir = tmp_dir
+        self.extract_images = extract_images
+
     @property
     def result(self):
         return ''.join(self.builder).strip()
@@ -331,7 +341,22 @@ class XHTMLHandler(sax.ContentHandler):
         elif name == 'em':
             self.append_formatting('\x19i')
         elif name == 'img':
-            builder.append(trim(attrs['src']))
+            if re.match(xhtml_data_re, attrs['src']) and self.extract_images:
+                type_, data = [i for i in re.split(xhtml_data_re, attrs['src']) if i]
+                bin_data = base64.b64decode(unquote(data))
+                filename = hashlib.sha1(bin_data).hexdigest() + '.' + type_
+                filepath = path.join(self.tmp_dir, filename)
+                if not path.exists(filepath):
+                    try:
+                        with open(filepath, 'wb') as fd:
+                            fd.write(bin_data)
+                        builder.append('file://%s' % filepath)
+                    except Exception as e:
+                        builder.append('[Error while saving image: %s]' % e)
+                else:
+                    builder.append('file://%s' % filepath)
+            else:
+                builder.append(trim(attrs['src']))
             if 'alt' in attrs:
                 builder.append(' (%s)' % trim(attrs['alt']))
         elif name == 'ul':
@@ -389,13 +414,14 @@ class XHTMLHandler(sax.ContentHandler):
         if 'title' in attrs:
             builder.append(' [' + attrs['title'] + ']')
 
-def xhtml_to_poezio_colors(xml, force=False):
+def xhtml_to_poezio_colors(xml, force=False, tmp_dir=None, extract_images=None):
     if isinstance(xml, str):
         xml = xml.encode('utf8')
     elif not isinstance(xml, bytes):
         xml = ET.tostring(xml)
 
-    handler = XHTMLHandler(force_ns=force)
+    handler = XHTMLHandler(force_ns=force, tmp_dir=tmp_dir,
+                           extract_images=extract_images)
     parser = sax.make_parser()
     parser.setFeature(sax.handler.feature_namespaces, True)
     parser.setContentHandler(handler)
author	mathieui <mathieui@mathieui.net>	2014-10-16 18:49:32 +0200
committer	mathieui <mathieui@mathieui.net>	2014-10-16 18:56:12 +0200
commit	a9f642f7438fe4489cdb9cc5ac59c929054656c8 (patch)
tree	5ce6cba2105f178f92235fa325725941a5c9924c /src/xhtml.py
parent	d4590949f7b691e3e1d6eff8fa339e62a44bae51 (diff)
download	poezio-a9f642f7438fe4489cdb9cc5ac59c929054656c8.tar.gz poezio-a9f642f7438fe4489cdb9cc5ac59c929054656c8.tar.bz2 poezio-a9f642f7438fe4489cdb9cc5ac59c929054656c8.tar.xz poezio-a9f642f7438fe4489cdb9cc5ac59c929054656c8.zip