Don't puke on non-ASCII characters when shelling out to html2text.

(imported from commit a407c9fb2a090075d5e26b5db00388f4f81de1f5)
This commit is contained in:
Jessica McKellar 2013-11-15 11:10:45 -05:00
parent 545e2539f8
commit b0209fec05
1 changed file with 3 additions and 2 deletions

View File

@@ -2227,10 +2227,11 @@ def convert_html_to_markdown(html):
except OSError:
continue
markdown = p.communicate(input=html)[0].strip()
markdown = p.communicate(input=html.encode("utf-8"))[0].strip()
# We want images to get linked and inline previewed, but html2text will turn
# them into image embeds of the form `![](http://foo.com/image.png?12345)`,
# which is ugly. Run a regex over the resulting description, turning embeds
# of that form into `[image.png](http://foo.com/image.png)`. Note that the
# regex only matches URLs that carry a query string (a literal `?`); image
# embeds without one are left unchanged.
return re.sub(r"!\[\]\((\S*)/(\S*)\?(\S*)\)", r"[\2](\1/\2)", markdown)
return re.sub(r"!\[\]\((\S*)/(\S*)\?(\S*)\)",
r"[\2](\1/\2)", markdown).decode("utf-8")