From a9f642f7438fe4489cdb9cc5ac59c929054656c8 Mon Sep 17 00:00:00 2001 From: mathieui Date: Thu, 16 Oct 2014 18:49:32 +0200 Subject: Extract XHTML-IM inline imags by default - Add two new options: tmp_image_dir and extract_inline_images - tmp_image_dir is $XDG_CACHE_HOME(usually ~/.cache)/poezio/images if unset - Name the images from a SHA-1 of their data and their mimetype - Output file:// links inside the message --- src/config.py | 21 ++++++++++++++++++++- src/core/handlers.py | 29 +++++++++++++++++++++++------ src/poezio.py | 1 + src/xhtml.py | 40 +++++++++++++++++++++++++++++++++------- 4 files changed, 77 insertions(+), 14 deletions(-) (limited to 'src') diff --git a/src/config.py b/src/config.py index 354c3447..5bd1ac17 100644 --- a/src/config.py +++ b/src/config.py @@ -361,7 +361,6 @@ def file_ok(filepath): def check_create_config_dir(): """ create the configuration directory if it doesn't exist - and copy the default config in it """ CONFIG_HOME = environ.get("XDG_CONFIG_HOME") if not CONFIG_HOME: @@ -374,6 +373,23 @@ def check_create_config_dir(): pass return CONFIG_PATH +def check_create_cache_dir(): + """ + create the cache directory if it doesn't exist + also create the subdirectories + """ + global CACHE_DIR + CACHE_HOME = environ.get("XDG_CACHE_HOME") + if not CACHE_HOME: + CACHE_HOME = path.join(environ.get('HOME'), '.cache') + CACHE_DIR = path.join(CACHE_HOME, 'poezio') + + try: + makedirs(CACHE_DIR) + makedirs(path.join(CACHE_DIR, 'images')) + except OSError: + pass + def run_cmdline_args(CONFIG_PATH): "Parse the command line arguments" global options @@ -495,3 +511,6 @@ safeJID = None # the global log dir LOG_DIR = '' + +# the global cache dir +CACHE_DIR = '' diff --git a/src/core/handlers.py b/src/core/handlers.py index 75c372bb..87aaecd5 100644 --- a/src/core/handlers.py +++ b/src/core/handlers.py @@ -10,6 +10,7 @@ import ssl import time from hashlib import sha1, sha512 from gettext import gettext as _ +from os import path from sleekxmpp import InvalidJID from sleekxmpp.stanza import Message @@ -24,7 +25,7 @@ import windows import xhtml import multiuserchat as muc from common import safeJID -from config import config +from config import config, CACHE_DIR from contact import Resource from logger import logger from roster import roster @@ -178,7 +179,11 @@ def on_normal_message(self, message): return self.information('%s says: %s' % (message['from'], message['body']), 'Headline') use_xhtml = config.get('enable_xhtml_im', True) - body = xhtml.get_body_from_message_stanza(message, use_xhtml=use_xhtml) + tmp_dir = config.get('tmp_image_dir', '') or path.join(CACHE_DIR, 'images') + extract_images = config.get('extract_inline_images', True) + body = xhtml.get_body_from_message_stanza(message, use_xhtml=use_xhtml, + tmp_dir=tmp_dir, + extract_images=extract_images) if not body: return @@ -223,7 +228,9 @@ def on_normal_message(self, message): self.events.trigger('conversation_msg', message, conversation) if not message['body']: return - body = xhtml.get_body_from_message_stanza(message, use_xhtml=use_xhtml) + body = xhtml.get_body_from_message_stanza(message, use_xhtml=use_xhtml, + tmp_dir=tmp_dir, + extract_images=extract_images) delayed, date = common.find_delayed_tag(message) def try_modify(): @@ -441,7 +448,11 @@ def on_groupchat_message(self, message): self.events.trigger('muc_msg', message, tab) use_xhtml = config.get('enable_xhtml_im', True) - body = xhtml.get_body_from_message_stanza(message, use_xhtml=use_xhtml) + tmp_dir = config.get('tmp_image_dir', '') or path.join(CACHE_DIR, 'images') + extract_images = config.get('extract_inline_images', True) + body = xhtml.get_body_from_message_stanza(message, use_xhtml=use_xhtml, + tmp_dir=tmp_dir, + extract_images=extract_images) if not body: return @@ -498,7 +509,11 @@ def on_groupchat_private_message(self, message): room_from = jid.bare use_xhtml = config.get('enable_xhtml_im', True) - body = xhtml.get_body_from_message_stanza(message, use_xhtml=use_xhtml) + tmp_dir = config.get('tmp_image_dir', '') or path.join(CACHE_DIR, 'images') + extract_images = config.get('extract_inline_images', True) + body = xhtml.get_body_from_message_stanza(message, use_xhtml=use_xhtml, + tmp_dir=tmp_dir, + extract_images=extract_images) tab = self.get_tab_by_name(jid.full, tabs.PrivateTab) # get the tab with the private conversation ignore = config.get_by_tabname('ignore_private', False, room_from) if not tab: # It's the first message we receive: create the tab @@ -511,7 +526,9 @@ def on_groupchat_private_message(self, message): self.xmpp.send_message(mto=jid.full, mbody=msg, mtype='chat') return self.events.trigger('private_msg', message, tab) - body = xhtml.get_body_from_message_stanza(message, use_xhtml=use_xhtml) + body = xhtml.get_body_from_message_stanza(message, use_xhtml=use_xhtml, + tmp_dir=tmp_dir, + extract_images=extract_images) if not body or not tab: return replaced_id = message['replace']['id'] diff --git a/src/poezio.py b/src/poezio.py index 1baf10eb..f82f103f 100644 --- a/src/poezio.py +++ b/src/poezio.py @@ -30,6 +30,7 @@ def main(): config.run_cmdline_args(config_path) config.create_global_config() config.check_create_log_dir() + config.check_create_cache_dir() config.setup_logging() config.post_logging_setup() diff --git a/src/xhtml.py b/src/xhtml.py index 48664311..69519f8d 100644 --- a/src/xhtml.py +++ b/src/xhtml.py @@ -12,9 +12,13 @@ xhtml code to shell colors, poezio colors to xhtml code """ -import re +import base64 import curses +import hashlib +import re +from os import path from sleekxmpp.xmlstream import ET +from urllib.parse import unquote from io import BytesIO from xml import sax @@ -178,10 +182,12 @@ colors = { whitespace_re = re.compile(r'\s+') xhtml_attr_re = re.compile(r'\x19-?\d[^}]*}|\x19[buaio]') +xhtml_data_re = re.compile(r'data:image/([a-z]+);base64,(.+)') xhtml_simple_attr_re = re.compile(r'\x19\d') -def get_body_from_message_stanza(message, use_xhtml=False): +def get_body_from_message_stanza(message, use_xhtml=False, + tmp_dir=None, extract_images=False): """ Returns a string with xhtml markups converted to poezio colors if there's an xhtml_im element, or @@ -191,7 +197,8 @@ def get_body_from_message_stanza(message, use_xhtml=False): xhtml = message['html'].xml xhtml_body = xhtml.find('{http://www.w3.org/1999/xhtml}body') if xhtml_body: - content = xhtml_to_poezio_colors(xhtml_body) + content = xhtml_to_poezio_colors(xhtml_body, tmp_dir=tmp_dir, + extract_images=extract_images) content = content if content else message['body'] return content or " " return message['body'] @@ -281,7 +288,7 @@ def trim(string): return re.sub(whitespace_re, ' ', string) class XHTMLHandler(sax.ContentHandler): - def __init__(self, force_ns=False): + def __init__(self, force_ns=False, tmp_dir=None, extract_images=False): self.builder = [] self.formatting = [] self.attrs = [] @@ -291,6 +298,9 @@ class XHTMLHandler(sax.ContentHandler): # do not care about xhtml-in namespace self.force_ns = force_ns + self.tmp_dir = tmp_dir + self.extract_images = extract_images + @property def result(self): return ''.join(self.builder).strip() @@ -331,7 +341,22 @@ class XHTMLHandler(sax.ContentHandler): elif name == 'em': self.append_formatting('\x19i') elif name == 'img': - builder.append(trim(attrs['src'])) + if re.match(xhtml_data_re, attrs['src']) and self.extract_images: + type_, data = [i for i in re.split(xhtml_data_re, attrs['src']) if i] + bin_data = base64.b64decode(unquote(data)) + filename = hashlib.sha1(bin_data).hexdigest() + '.' + type_ + filepath = path.join(self.tmp_dir, filename) + if not path.exists(filepath): + try: + with open(filepath, 'wb') as fd: + fd.write(bin_data) + builder.append('file://%s' % filepath) + except Exception as e: + builder.append('[Error while saving image: %s]' % e) + else: + builder.append('file://%s' % filepath) + else: + builder.append(trim(attrs['src'])) if 'alt' in attrs: builder.append(' (%s)' % trim(attrs['alt'])) elif name == 'ul': @@ -389,13 +414,14 @@ class XHTMLHandler(sax.ContentHandler): if 'title' in attrs: builder.append(' [' + attrs['title'] + ']') -def xhtml_to_poezio_colors(xml, force=False): +def xhtml_to_poezio_colors(xml, force=False, tmp_dir=None, extract_images=None): if isinstance(xml, str): xml = xml.encode('utf8') elif not isinstance(xml, bytes): xml = ET.tostring(xml) - handler = XHTMLHandler(force_ns=force) + handler = XHTMLHandler(force_ns=force, tmp_dir=tmp_dir, + extract_images=extract_images) parser = sax.make_parser() parser.setFeature(sax.handler.feature_namespaces, True) parser.setContentHandler(handler) -- cgit v1.2.3