import itertools
import ujson
import random

from typing import List, Dict, Any

def load_config() -> Dict[str, Any]:
    with open("zerver/tests/fixtures/config.generate_data.json", "r") as infile:
        config = ujson.load(infile)
    return config
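
# For reference, a sketch of the shape this loader expects. The key names come
# from how the config is used below; the example values are made up:
#
#   {
#       "gen_fodder": {"nouns": [...], "adjectives": [...], "connectors": [...],
#                      "verbs": [...], "adverbs": [...], "emoji": [...],
#                      "links": [...], "maths": [...], "inline-code": [...],
#                      "code-blocks": [...], "quote-blocks": [...], "lists": [...]},
#       "dist_percentages": {"italic": 10, "bold": 10, ...},
#       "corpus": {"filename": "..."}
#   }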

def get_stream_title(gens: Dict[str, Any]) -> str:
    return next(gens["adjectives"]) + " " + next(gens["nouns"]) + " " + \
        next(gens["connectors"]) + " " + next(gens["verbs"]) + " " + \
        next(gens["adverbs"])

def load_generators(config: Dict[str, Any]) -> Dict[str, Any]:
    results = {}
    cfg = config["gen_fodder"]

    results["nouns"] = itertools.cycle(cfg["nouns"])
    results["adjectives"] = itertools.cycle(cfg["adjectives"])
    results["connectors"] = itertools.cycle(cfg["connectors"])
    results["verbs"] = itertools.cycle(cfg["verbs"])
    results["adverbs"] = itertools.cycle(cfg["adverbs"])
    results["emojis"] = itertools.cycle(cfg["emoji"])
    results["links"] = itertools.cycle(cfg["links"])

    results["maths"] = itertools.cycle(cfg["maths"])
    results["inline-code"] = itertools.cycle(cfg["inline-code"])
    results["code-blocks"] = itertools.cycle(cfg["code-blocks"])
    results["quote-blocks"] = itertools.cycle(cfg["quote-blocks"])

    results["lists"] = itertools.cycle(cfg["lists"])

    return results
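
# Usage sketch (hypothetical fodder values): every entry is an itertools.cycle,
# so callers can keep calling next() on it without exhausting it, e.g.
#
#   gens = load_generators(load_config())
#   title = get_stream_title(gens)  # e.g. "Quiet Tree beside Running Slowly"
#   snippet = next(gens["code-blocks"])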

def parse_file(config: Dict[str, Any], gens: Dict[str, Any], corpus_file: str) -> List[str]:
    # First, load the entire file into a list of paragraphs,
    # then apply our custom filters to it as needed.
    paragraphs = []  # type: List[str]

    with open(corpus_file, "r") as infile:
        # Our data: we need to separate the person talking and what they say.
        paragraphs = remove_line_breaks(infile)
        paragraphs = add_flair(paragraphs, gens)

    return paragraphs

def get_flair_gen(length: int) -> List[str]:
    # Grab the percentages from the config file and build a list we can
    # consume that guarantees the configured distribution.
    result = []

    for k, v in config["dist_percentages"].items():
        result.extend([k] * int(v * length / 100))

    result.extend(["None"] * (length - len(result)))

    random.shuffle(result)
    return result
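
# Illustration (assumed percentages, not the real config): with
# dist_percentages = {"italic": 10, "bold": 5} and length = 100, the list holds
# roughly 10 "italic" and 5 "bold" entries, is padded to 100 with "None", and is
# then shuffled, so the flair types follow the configured distribution.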

def add_flair(paragraphs: List[str], gens: Dict[str, Any]) -> List[str]:
    # Roll the dice and see what kind of flair we should add, if any.
    results = []

    flair = get_flair_gen(len(paragraphs))

    for i in range(len(paragraphs)):
        key = flair[i]
        if key == "None":
            txt = paragraphs[i]
        elif key == "italic":
            txt = add_md("*", paragraphs[i])
        elif key == "bold":
            txt = add_md("**", paragraphs[i])
        elif key == "strike-thru":
            txt = add_md("~~", paragraphs[i])
        elif key == "quoted":
            txt = ">" + paragraphs[i]
        elif key == "quote-block":
            txt = paragraphs[i] + "\n" + next(gens["quote-blocks"])
        elif key == "inline-code":
            txt = paragraphs[i] + "\n" + next(gens["inline-code"])
        elif key == "code-block":
            txt = paragraphs[i] + "\n" + next(gens["code-blocks"])
        elif key == "math":
            txt = paragraphs[i] + "\n" + next(gens["maths"])
        elif key == "list":
            txt = paragraphs[i] + "\n" + next(gens["lists"])
        elif key == "emoji":
            txt = add_emoji(paragraphs[i], next(gens["emojis"]))
        elif key == "link":
            txt = add_link(paragraphs[i], next(gens["links"]))
        elif key == "picture":
            txt = paragraphs[i]  # TODO: implement pictures

        results.append(txt)

    return results

def add_md(mode: str, text: str) -> str:
    # mode means: bold, italic, etc.
    # To add a list at the end of a paragraph: * item one\n * item two
    # Find out how long the line is, then insert the mode before the end.

    vals = text.split()
    start = random.randrange(len(vals))
    end = random.randrange(len(vals) - start) + start
    vals[start] = mode + vals[start]
    vals[end] = vals[end] + mode

    return " ".join(vals).strip()

def add_emoji(text: str, emoji: str) -> str:
    vals = text.split()
    start = random.randrange(len(vals))

    vals[start] = vals[start] + " " + emoji + " "
    return " ".join(vals)

def add_link(text: str, link: str) -> str:
    vals = text.split()
    start = random.randrange(len(vals))

    vals[start] = vals[start] + " " + link + " "

    return " ".join(vals)

def remove_line_breaks(fh: Any) -> List[str]:
    # We're going to remove line breaks from paragraphs.
    results = []  # the joined paragraphs, one string per paragraph
    para = []  # we'll store the lines here to form a paragraph

    for line in fh:
        text = line.strip()
        if text != "":
            para.append(text)
        else:
            if para:
                results.append(" ".join(para))
            # reset the paragraph
            para = []
    if para:
        results.append(" ".join(para))

    return results
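
# Illustration (made-up corpus lines): an input file containing
#
#   line one of paragraph one
#   line two of paragraph one
#
#   line one of paragraph two
#
# collapses into two paragraphs:
#   ["line one of paragraph one line two of paragraph one",
#    "line one of paragraph two"]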

def write_file(paragraphs: List[str], filename: str) -> None:
    with open(filename, "w") as outfile:
        outfile.write(ujson.dumps(paragraphs))

def create_test_data() -> None:
    gens = load_generators(config)  # returns a dictionary of generators

    paragraphs = parse_file(config, gens, config["corpus"]["filename"])

    write_file(paragraphs, "var/test_messages.json")

config = load_config()  # type: Dict[str, Any]

if __name__ == "__main__":
    create_test_data()