rss-bot: Add --unwrap, --math options.

These are for processing arXiv API results.
This commit is contained in:
Reid Barton 2017-05-25 14:58:24 -04:00 committed by Tim Abbott
parent 8e978df957
commit 461856dd56
1 changed file with 26 additions and 1 deletion

View File

@ -31,6 +31,7 @@ from six.moves.html_parser import HTMLParser
import logging import logging
import optparse import optparse
import os import os
import re
import sys import sys
import time import time
from six.moves import urllib from six.moves import urllib
@ -82,6 +83,16 @@ parser.add_option('--feed-file',
help='The file containing a list of RSS feed URLs to follow, one URL per line', help='The file containing a list of RSS feed URLs to follow, one URL per line',
default=os.path.join(RSS_DATA_DIR, "rss-feeds"), default=os.path.join(RSS_DATA_DIR, "rss-feeds"),
action='store') action='store')
# Opt-in switch, read back as opts.unwrap: when given, each summary is
# passed through unwrap_text() before it is posted.
parser.add_option('--unwrap',
                  action='store_true',
                  dest='unwrap',
                  default=False,
                  help='Convert word-wrapped paragraphs into single lines')
# Opt-in switch, read back as opts.math: when given, every '$' in the
# outgoing message content is doubled to '$$'.
parser.add_option('--math',
                  action='store_true',
                  dest='math',
                  default=False,
                  help='Convert $ to $$ (for KaTeX processing)')
parser.add_option_group(zulip.generate_option_group(parser)) parser.add_option_group(zulip.generate_option_group(parser))
(opts, args) = parser.parse_args() # type: Tuple[Any, List[str]] (opts, args) = parser.parse_args() # type: Tuple[Any, List[str]]
@ -147,6 +158,12 @@ def compute_entry_hash(entry):
entry_id = entry.get("id", entry.get("link")) entry_id = entry.get("id", entry.get("link"))
return hashlib.md5(entry_id + str(entry_time)).hexdigest() return hashlib.md5(entry_id + str(entry_time)).hexdigest()
def unwrap_text(body):
    # type: (str) -> str
    # Undo hard word-wrapping: a newline sitting directly between two
    # non-newline characters is turned into a single space.  Runs of two
    # or more newlines (paragraph breaks) are kept, and so is a newline
    # at the very start or very end of the text, because the lookbehind
    # and lookahead each require an actual neighbouring character.
    # Example: '\na\nb\nc\n\nd\n' -> '\na b c\n\nd\n'
    lone_newline = re.compile('(?<=[^\n])\n(?=[^\n])')
    return lone_newline.sub(' ', body)
def elide_subject(subject): def elide_subject(subject):
# type: (str) -> str # type: (str) -> str
MAX_TOPIC_LENGTH = 60 MAX_TOPIC_LENGTH = 60
@ -156,10 +173,18 @@ def elide_subject(subject):
def send_zulip(entry, feed_name): def send_zulip(entry, feed_name):
# type: (Any, str) -> Dict[str, Any] # type: (Any, str) -> Dict[str, Any]
body = entry.summary # type: str
if opts.unwrap:
body = unwrap_text(body)
content = "**[%s](%s)**\n%s\n%s" % (entry.title, content = "**[%s](%s)**\n%s\n%s" % (entry.title,
entry.link, entry.link,
strip_tags(entry.summary), strip_tags(body),
entry.link) # type: str entry.link) # type: str
if opts.math:
content = content.replace('$', '$$')
message = {"type": "stream", message = {"type": "stream",
"sender": opts.zulip_email, "sender": opts.zulip_email,
"to": opts.stream, "to": opts.stream,