From a9f642f7438fe4489cdb9cc5ac59c929054656c8 Mon Sep 17 00:00:00 2001
From: mathieui <mathieui@mathieui.net>
Date: Thu, 16 Oct 2014 18:49:32 +0200
Subject: Extract XHTML-IM inline images by default

- Add two new options: tmp_image_dir and extract_inline_images
- tmp_image_dir defaults to $XDG_CACHE_HOME/poezio/images (usually ~/.cache/poezio/images) if unset
- Name each image after the SHA-1 hex digest of its data, using its MIME subtype as the file extension
- Output file:// links inside the message
---
 src/xhtml.py | 40 +++++++++++++++++++++++++++++++++-------
 1 file changed, 33 insertions(+), 7 deletions(-)

(limited to 'src/xhtml.py')

diff --git a/src/xhtml.py b/src/xhtml.py
index 48664311..69519f8d 100644
--- a/src/xhtml.py
+++ b/src/xhtml.py
@@ -12,9 +12,13 @@ xhtml code to shell colors,
 poezio colors to xhtml code
 """
 
-import re
+import base64
 import curses
+import hashlib
+import re
+from os import path
 from sleekxmpp.xmlstream import ET
+from urllib.parse import unquote
 
 from io import BytesIO
 from xml import sax
@@ -178,10 +182,12 @@ colors = {
 whitespace_re = re.compile(r'\s+')
 
 xhtml_attr_re = re.compile(r'\x19-?\d[^}]*}|\x19[buaio]')
+xhtml_data_re = re.compile(r'data:image/([a-z]+);base64,(.+)')
 
 xhtml_simple_attr_re = re.compile(r'\x19\d')
 
-def get_body_from_message_stanza(message, use_xhtml=False):
+def get_body_from_message_stanza(message, use_xhtml=False,
+                                 tmp_dir=None, extract_images=False):
     """
     Returns a string with xhtml markups converted to
     poezio colors if there's an xhtml_im element, or
@@ -191,7 +197,8 @@ def get_body_from_message_stanza(message, use_xhtml=False):
         xhtml = message['html'].xml
         xhtml_body = xhtml.find('{http://www.w3.org/1999/xhtml}body')
         if xhtml_body:
-            content = xhtml_to_poezio_colors(xhtml_body)
+            content = xhtml_to_poezio_colors(xhtml_body, tmp_dir=tmp_dir,
+                                             extract_images=extract_images)
             content = content if content else message['body']
             return content or " "
     return message['body']
@@ -281,7 +288,7 @@ def trim(string):
     return re.sub(whitespace_re, ' ', string)
 
 class XHTMLHandler(sax.ContentHandler):
-    def __init__(self, force_ns=False):
+    def __init__(self, force_ns=False, tmp_dir=None, extract_images=False):
         self.builder = []
         self.formatting = []
         self.attrs = []
@@ -291,6 +298,9 @@ class XHTMLHandler(sax.ContentHandler):
         # do not care about xhtml-in namespace
         self.force_ns = force_ns
 
+        self.tmp_dir = tmp_dir
+        self.extract_images = extract_images
+
     @property
     def result(self):
         return ''.join(self.builder).strip()
@@ -331,7 +341,22 @@ class XHTMLHandler(sax.ContentHandler):
         elif name == 'em':
             self.append_formatting('\x19i')
         elif name == 'img':
-            builder.append(trim(attrs['src']))
+            if re.match(xhtml_data_re, attrs['src']) and self.extract_images:
+                type_, data = [i for i in re.split(xhtml_data_re, attrs['src']) if i]
+                bin_data = base64.b64decode(unquote(data))
+                filename = hashlib.sha1(bin_data).hexdigest() + '.' + type_
+                filepath = path.join(self.tmp_dir, filename)
+                if not path.exists(filepath):
+                    try:
+                        with open(filepath, 'wb') as fd:
+                            fd.write(bin_data)
+                        builder.append('file://%s' % filepath)
+                    except Exception as e:
+                        builder.append('[Error while saving image: %s]' % e)
+                else:
+                    builder.append('file://%s' % filepath)
+            else:
+                builder.append(trim(attrs['src']))
             if 'alt' in attrs:
                 builder.append(' (%s)' % trim(attrs['alt']))
         elif name == 'ul':
@@ -389,13 +414,14 @@ class XHTMLHandler(sax.ContentHandler):
         if 'title' in attrs:
             builder.append(' [' + attrs['title'] + ']')
 
-def xhtml_to_poezio_colors(xml, force=False):
+def xhtml_to_poezio_colors(xml, force=False, tmp_dir=None, extract_images=None):
     if isinstance(xml, str):
         xml = xml.encode('utf8')
     elif not isinstance(xml, bytes):
         xml = ET.tostring(xml)
 
-    handler = XHTMLHandler(force_ns=force)
+    handler = XHTMLHandler(force_ns=force, tmp_dir=tmp_dir,
+                           extract_images=extract_images)
     parser = sax.make_parser()
     parser.setFeature(sax.handler.feature_namespaces, True)
     parser.setContentHandler(handler)
-- 
cgit v1.2.3