Move use of html2text to a subprocess call.

(imported from commit 36e8a6f030d75196c28fbdc0e58c6968952d95ff)
This commit is contained in:
Jessica McKellar 2013-11-13 10:45:02 -05:00
parent 0b592a27b1
commit 8382e074fe
3 changed files with 28 additions and 22 deletions

View File

@ -58,6 +58,7 @@ import platform
import logging
from collections import defaultdict
import urllib
import subprocess
# Store an event in the log for re-importing messages
def log_event(event):
@ -2190,3 +2191,26 @@ def alias_for_realm(domain):
return RealmAlias.objects.get(domain=domain)
except RealmAlias.DoesNotExist:
return None
def convert_html_to_markdown(html):
    """Convert an HTML document to Markdown by piping it through an external tool.

    The HTML is written to the standard input of the first converter command
    that can be launched, and the command's combined stdout/stderr is taken as
    the Markdown result. Image links of the form
    `![](http://foo.com/image.png?12345)` in the result are rewritten to
    `[image.png](http://foo.com/image.png)`.

    Args:
        html: The HTML to convert. Text input is encoded as UTF-8 before being
            sent to the converter; byte-string input is sent unchanged.

    Returns:
        The converted Markdown with surrounding whitespace stripped. The result
        is text when `html` was text and a byte string when `html` was a byte
        string.

    Raises:
        OSError: If none of the candidate converter commands could be started.
    """
    # On Linux, the tool installs as html2markdown, and there's a command called
    # html2text that does something totally different. On OSX, the tool installs
    # as html2text.
    commands = ["html2markdown", "html2text"]

    # The pipe to the subprocess carries bytes, so encode text explicitly
    # rather than relying on an implicit (ASCII) conversion.
    is_text = not isinstance(html, bytes)
    payload = html.encode("utf-8") if is_text else html

    process = None
    for command in commands:
        try:
            # A body width of 0 means do not try to wrap the text for us.
            process = subprocess.Popen(
                [command, "--body-width=0"], stdout=subprocess.PIPE,
                stdin=subprocess.PIPE, stderr=subprocess.STDOUT)
        except OSError:
            # This candidate could not be launched; fall through to the next.
            continue
        # Stop at the first command that starts so only one process is spawned.
        break

    if process is None:
        raise OSError(
            "Unable to run an HTML to Markdown converter (tried: %s)"
            % (", ".join(commands),))

    # NOTE: stderr is merged into stdout above, so anything the tool prints as
    # a diagnostic ends up in the returned Markdown.
    markdown = process.communicate(input=payload)[0].strip()

    # We want images to get linked and inline previewed, but html2text will turn
    # them into links of the form `![](http://foo.com/image.png)`, which is
    # ugly. Run a regex over the resulting description, turning links of the
    # form `![](http://foo.com/image.png?12345)` into
    # `[image.png](http://foo.com/image.png)`.
    image_link = r"!\[\]\((\S*)/(\S*)\?(\S*)\)"
    named_link = r"[\2](\1/\2)"
    if is_text:
        markdown = markdown.decode("utf-8")
    elif not isinstance(image_link, bytes):
        # Byte-string output needs a byte-string pattern on interpreters where
        # plain string literals are text.
        image_link = image_link.encode("ascii")
        named_link = named_link.encode("ascii")
    return re.sub(image_link, named_link, markdown)

View File

@ -27,15 +27,13 @@ import sys
from django.conf import settings
from django.core.management.base import BaseCommand
from zerver.lib.actions import decode_email_address
from zerver.lib.actions import decode_email_address, convert_html_to_markdown
from zerver.lib.upload import upload_message_image
from zerver.models import Stream, get_user_profile_by_email, UserProfile
from twisted.internet import protocol, reactor, ssl
from twisted.mail import imap4
import html2text
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../../api"))
import zulip
@ -159,10 +157,7 @@ def extract_body(message):
# If we only have an HTML version, try to make that look nice.
html_content = get_message_part_by_type(message, "text/html")
if html_content:
converter = html2text.HTML2Text()
converter.ignore_links = True
converter.ignore_images = True
return converter.handle(html_content)
return convert_html_to_markdown(html_content)
raise ZulipEmailForwardError("Unable to find plaintext or HTML message body")

View File

@ -6,7 +6,7 @@ from django.conf import settings
from django.views.decorators.csrf import csrf_exempt
from zerver.models import UserProfile, get_client, MAX_SUBJECT_LENGTH, \
get_user_profile_by_email
from zerver.lib.actions import check_send_message
from zerver.lib.actions import check_send_message, convert_html_to_markdown
from zerver.lib.response import json_success, json_error
from zerver.decorator import authenticated_api_view, REQ, \
has_request_variables, json_to_dict, authenticated_rest_api_view, \
@ -15,7 +15,6 @@ from zerver.views import send_message_backend
from django.db.models import Q
from defusedxml.ElementTree import fromstring as xml_fromstring
import html2text
import base64
import logging
@ -646,19 +645,7 @@ def format_freshdesk_property_change_message(ticket, event_info):
def format_freshdesk_ticket_creation_message(ticket):
# They send us the description as HTML.
html2text.BODY_WIDTH = 0 # Do not try to wrap the text for us.
converter = html2text.HTML2Text()
converter.ignore_links = False
converter.ignore_images = False
cleaned_description = converter.handle(ticket.description).strip()
# We want images to get linked and inline previewed, but html2text will turn
# them into links of the form `![](http://foo.com/image.png)`, which is
# ugly. Run a regex over the resulting description, turning links of the
# form `![](http://foo.com/image.png?12345)` into
# `[image.png](http://foo.com/image.png)`.
cleaned_description = re.sub(r"!\[\]\((\S*)/(\S*)\?(\S*)\)", r"[\2](\1/\2)",
cleaned_description)
cleaned_description = convert_html_to_markdown(ticket.description)
content = "%s <%s> created [ticket #%s](%s):\n\n" % (
ticket.requester_name, ticket.requester_email, ticket.id, ticket.url)
content += """~~~ quote