generate_test_data: Replace source with non-Gutenberg text.

It's hard to find literature that has the community tone we're going for
and that is also consistent with the Zulip code of conduct, etc.

This commit removes the special tooling for Gutenberg plays, and changes the
text to be some mixture of scigen, Communications From Elsewhere,
chat.zulip.org, and various books from the public domain.
This commit is contained in:
Rishi Gupta 2017-08-04 15:31:51 -07:00 committed by Tim Abbott
parent 4350b52740
commit 4bf8ac2498
3 changed files with 207 additions and 3494 deletions

View File

@ -1,22 +0,0 @@
from unittest import TestCase
from zerver.lib.generate_test_data import remove_actions
class CheckRemoveActions(TestCase):
    """Checks that remove_actions strips bracketed stage directions.

    Only the bracketed span itself is removed; whitespace on either side
    of it is left untouched, which is why the expected strings below keep
    their leading, trailing, or doubled spaces.
    """

    def test_remove_leading_action(self):
        # type: () -> None
        """An action at the start leaves the space that followed it."""
        text = "[Walks to the dresser.] This looks interesting."
        result = remove_actions(text)
        self.assertEqual(result, " This looks interesting.")

    def test_remove_trailingaction(self):
        # type: () -> None
        """An action at the end leaves the space that preceded it."""
        text = "This looks interesting. [Walks to the dresser.]"
        result = remove_actions(text)
        self.assertEqual(result, "This looks interesting. ")

    def test_remove_middle_action(self):
        # type: () -> None
        """An action in the middle leaves the spaces on both sides."""
        text = "This looks [Walks to the dresser.] interesting."
        result = remove_actions(text)
        # Both the space before "[" and the space after "]" survive, so
        # the result contains two consecutive spaces.
        self.assertEqual(result, "This looks  interesting.")

View File

@ -54,7 +54,6 @@ def parse_file(config, gens, corpus_file):
with open(corpus_file, "r") as infile: with open(corpus_file, "r") as infile:
# OUR DATA: we need to seperate the person talking and what they say # OUR DATA: we need to seperate the person talking and what they say
paragraphs = remove_line_breaks(infile) paragraphs = remove_line_breaks(infile)
paragraphs = process_dialog(paragraphs)
paragraphs = add_flair(paragraphs, gens) paragraphs = add_flair(paragraphs, gens)
return paragraphs return paragraphs
@ -150,37 +149,6 @@ def add_link(text, link):
return " ".join(vals) return " ".join(vals)
def remove_actions(line):
    # type: (str) -> str
    """Return ``line`` with every ``[...]`` bracketed span removed.

    Each span runs from a "[" to the first "]" that follows it, brackets
    included.  Text outside the spans, including surrounding whitespace,
    is kept exactly as it was.

    A "[" with no closing "]" drops everything from that "[" to the end
    of the line.  A "]" that is not preceded by an opening "[" is
    ordinary text and is kept.
    """
    kept = []
    cursor = 0
    while True:
        start = line.find("[", cursor)
        if start == -1:
            # No further actions: keep the remainder of the line.
            kept.append(line[cursor:])
            break
        kept.append(line[cursor:start])
        # Search for the closing bracket only after the opening one, so a
        # stray earlier "]" can never be paired with this "[".
        end = line.find("]", start + 1)
        if end == -1:
            # Unterminated action: discard the rest of the line.
            break
        cursor = end + 1
    return "".join(kept)
def process_dialog(paragraphs):
    # type: (List[str]) -> List[str]
    """Return the spoken text extracted from each paragraph of dialog.

    Every paragraph is passed through get_dialog.  Paragraphs for which
    it returns None (not a line of dialog) are dropped, as are paragraphs
    whose extracted text is empty.  Order is preserved.
    """
    results = []
    for paragraph in paragraphs:
        spoken = get_dialog(paragraph)
        # get_dialog yields either None or a string; a truthiness check
        # skips both None and "" without indexing into an empty string.
        if spoken:
            results.append(spoken)
    return results
def remove_line_breaks(fh): def remove_line_breaks(fh):
# type: (Any) -> List[str] # type: (Any) -> List[str]
@ -191,10 +159,6 @@ def remove_line_breaks(fh):
for line in fh: for line in fh:
text = line.strip() text = line.strip()
# this is the standard notification to mark the end of Gutenberg stuff
if text.startswith("***END OF THE PROJECT GUTENBERG"):
break
if text != "": if text != "":
para.append(text) para.append(text)
else: else:
@ -207,24 +171,6 @@ def remove_line_breaks(fh):
return results return results
def get_dialog(line):
    # type: (str) -> Any
    """Extract the spoken text from one line of a play, if it is dialog.

    The text before the first "." is taken as the speaker.  When that
    prefix is non-empty and contains fewer than two whitespace-separated
    words, the text starting two characters after the "." is stripped,
    passed through remove_actions, and returned.  In every other case
    the function returns None.
    """
    dot = line.find('.')
    # No period at all, or a period in the very first column: there is
    # no speaker prefix to examine.
    if dot <= 0:
        return None

    speaker_words = line[:dot].split()
    if len(speaker_words) >= 2:
        # no actor, so not a line of dialog
        return None

    # Skip the period and the character after it before cleaning up.
    spoken = line[dot + 2:].strip()
    return remove_actions(spoken)
def write_file(paragraphs, filename): def write_file(paragraphs, filename):
# type: (List[str], str) -> None # type: (List[str], str) -> None

File diff suppressed because it is too large Load Diff