2017-05-23 23:57:42 +02:00
|
|
|
import itertools
|
2019-06-09 20:27:12 +02:00
|
|
|
import os
|
2020-06-11 00:54:34 +02:00
|
|
|
import random
|
|
|
|
from typing import Any, Dict, List
|
|
|
|
|
|
|
|
import ujson
|
2019-06-09 20:27:12 +02:00
|
|
|
|
|
|
|
from scripts.lib.zulip_tools import get_or_create_dev_uuid_var_path
|
2017-05-23 23:57:42 +02:00
|
|
|
|
2020-06-11 00:54:34 +02:00
|
|
|
|
2017-11-05 11:15:10 +01:00
|
|
|
def load_config() -> Dict[str, Any]:
    """Load and return the message-generation fixture configuration.

    The JSON fixture provides the word lists ("gen_fodder") and corpus
    settings used by the generators in this module.  The path is relative
    to the repository root, so this must run from there.
    """
    with open("zerver/tests/fixtures/config.generate_data.json") as config_file:
        return ujson.load(config_file)
|
|
|
|
|
2020-05-05 07:55:47 +02:00
|
|
|
def generate_topics(num_topics: int) -> List[str]:
    """Return `num_topics` randomly generated topic strings.

    Roughly a third of the topics are a single noun: single-word topics
    are the most common case in practice, so it is important that tests
    exercise them.  The remaining topics are variable-length phrases
    assembled from several parts of speech.
    """
    config = load_config()["gen_fodder"]

    # Make single word topics account for 30% of total topics.
    num_single_word_topics = num_topics // 3
    topics = [random.choice(config["nouns"]) for _ in range(num_single_word_topics)]

    parts_of_speech = ["adjectives", "nouns", "connectors", "verbs", "adverbs"]
    for part in parts_of_speech:
        # Add an empty string so that we can generate variable length topics.
        config[part].append("")

    for _ in range(num_topics - num_single_word_topics):
        chosen = [random.choice(config[part]) for part in parts_of_speech]
        # Drop the empty-string picks so we don't emit doubled spaces.
        topics.append(" ".join(word for word in chosen if word))

    return topics
|
|
|
|
|
2017-11-05 11:15:10 +01:00
|
|
|
def load_generators(config: Dict[str, Any]) -> Dict[str, Any]:
    """Build infinite cycling generators over each fixture word list.

    Returns a dict mapping a generator name to an ``itertools.cycle``
    over the corresponding list in ``config["gen_fodder"]``.  Note the
    one naming mismatch: the "emojis" generator reads the "emoji" key.
    """
    fodder = config["gen_fodder"]
    # (result key, fodder key) pairs; kept explicit so the mismatched
    # "emojis"/"emoji" pair is visible.
    key_pairs = [
        ("nouns", "nouns"),
        ("adjectives", "adjectives"),
        ("connectors", "connectors"),
        ("verbs", "verbs"),
        ("adverbs", "adverbs"),
        ("emojis", "emoji"),
        ("links", "links"),
        ("maths", "maths"),
        ("inline-code", "inline-code"),
        ("code-blocks", "code-blocks"),
        ("quote-blocks", "quote-blocks"),
        ("lists", "lists"),
    ]
    return {result_key: itertools.cycle(fodder[fodder_key]) for result_key, fodder_key in key_pairs}
|
|
|
|
|
2017-11-05 11:15:10 +01:00
|
|
|
def parse_file(config: Dict[str, Any], gens: Dict[str, Any], corpus_file: str) -> List[str]:
    """Read the corpus file and return its paragraphs, decorated with flair.

    First the entire file is collapsed into a list of one-line paragraphs,
    then the custom Markdown "flair" filters are applied to each paragraph.
    """
    # OUR DATA: we need to separate the person talking and what they say
    with open(corpus_file) as corpus:
        paragraphs = remove_line_breaks(corpus)
    return add_flair(paragraphs, gens)
|
|
|
|
|
2017-11-05 11:15:10 +01:00
|
|
|
def get_flair_gen(length: int) -> List[str]:
    """Return a shuffled list of `length` flair keys.

    The per-flair percentages come from the module-level ``config``
    ("dist_percentages"); building the whole list up front guarantees the
    configured distribution is honored exactly, with any remainder after
    integer truncation filled by "None" (no flair).
    """
    distribution: List[str] = []
    for flair_key, percentage in config["dist_percentages"].items():
        distribution.extend([flair_key] * int(percentage * length / 100))

    # Pad with "None" entries so the list is exactly `length` long.
    distribution.extend(["None"] * (length - len(distribution)))

    random.shuffle(distribution)
    return distribution
|
|
|
|
|
2017-11-05 11:15:10 +01:00
|
|
|
def add_flair(paragraphs: List[str], gens: Dict[str, Any]) -> List[str]:
    """Decorate each paragraph with its randomly assigned Markdown flair.

    A flair key is drawn for every paragraph (see get_flair_gen); each key
    maps to a Markdown decoration — emphasis, quoting, attached code/quote
    blocks, an emoji, a link, etc.  Unrecognized keys (including "None" and
    the not-yet-implemented "picture") leave the paragraph unchanged.
    """
    # roll the dice and see what kind of flair we should add, if any
    results = []
    flair = get_flair_gen(len(paragraphs))

    for i, paragraph in enumerate(paragraphs):
        key = flair[i]
        if key == "italic":
            txt = add_md("*", paragraph)
        elif key == "bold":
            txt = add_md("**", paragraph)
        elif key == "strike-thru":
            txt = add_md("~~", paragraph)
        elif key == "quoted":
            txt = ">" + paragraph
        elif key == "quote-block":
            txt = paragraph + "\n" + next(gens["quote-blocks"])
        elif key == "inline-code":
            txt = paragraph + "\n" + next(gens["inline-code"])
        elif key == "code-block":
            txt = paragraph + "\n" + next(gens["code-blocks"])
        elif key == "math":
            txt = paragraph + "\n" + next(gens["maths"])
        elif key == "list":
            txt = paragraph + "\n" + next(gens["lists"])
        elif key == "emoji":
            txt = add_emoji(paragraph, next(gens["emojis"]))
        elif key == "link":
            txt = add_link(paragraph, next(gens["links"]))
        else:
            # "None", "picture" (TODO: implement pictures), and any
            # unknown key pass the paragraph through unmodified.  The
            # previous code did `txt = txt` for "picture", which raised
            # UnboundLocalError whenever "picture" was drawn before any
            # other flair had assigned txt.
            txt = paragraph

        results.append(txt)

    return results
|
|
|
|
|
2017-11-05 11:15:10 +01:00
|
|
|
def add_md(mode: str, text: str) -> str:
    """Wrap a random span of words in `text` with the marker `mode`.

    `mode` is a Markdown emphasis marker such as "*", "**", or "~~".
    A start word and an end word (at or after the start) are chosen at
    random; the marker is prepended to the start word and appended to
    the end word, then the words are rejoined with single spaces.
    """
    words = text.split()
    # Pick the opening position first, then a closing position no
    # earlier than the opening one.
    opener = random.randrange(len(words))
    closer = opener + random.randrange(len(words) - opener)
    words[opener] = mode + words[opener]
    words[closer] = words[closer] + mode
    return " ".join(words).strip()
|
|
|
|
|
2017-11-05 11:15:10 +01:00
|
|
|
def add_emoji(text: str, emoji: str) -> str:
    """Insert `emoji` (padded with spaces) after one random word of `text`."""
    words = text.split()
    position = random.randrange(len(words))
    words[position] = words[position] + " " + emoji + " "
    return " ".join(words)
|
|
|
|
|
2017-11-05 11:15:10 +01:00
|
|
|
def add_link(text: str, link: str) -> str:
    """Insert `link` (padded with spaces) after one random word of `text`."""
    words = text.split()
    position = random.randrange(len(words))
    words[position] = words[position] + " " + link + " "
    return " ".join(words)
|
|
|
|
|
2017-11-05 11:15:10 +01:00
|
|
|
def remove_line_breaks(fh: Any) -> List[str]:
    """Collapse the paragraphs in `fh` into single-line strings.

    `fh` is any iterable of lines (typically an open text file).
    Consecutive non-blank lines form one paragraph and are joined with
    spaces; blank lines separate paragraphs and are discarded.  A final
    paragraph without a trailing blank line is still included.
    """
    paragraphs: List[str] = []
    current: List[str] = []  # lines accumulated for the paragraph in progress

    for raw_line in fh:
        stripped = raw_line.strip()
        if stripped:
            current.append(stripped)
        elif current:
            # Blank line: flush the finished paragraph and reset.
            paragraphs.append(" ".join(current))
            current = []

    # Flush a trailing paragraph that wasn't followed by a blank line.
    if current:
        paragraphs.append(" ".join(current))

    return paragraphs
|
|
|
|
|
2017-11-05 11:15:10 +01:00
|
|
|
def write_file(paragraphs: List[str], filename: str) -> None:
    """Serialize `paragraphs` as a JSON array to `filename`, overwriting it."""
    with open(filename, "w") as target:
        target.write(ujson.dumps(paragraphs))
|
|
|
|
|
2017-11-05 11:15:10 +01:00
|
|
|
def create_test_data() -> None:
    """Generate the corpus-based test-message fixture file.

    Reads the module-level ``config``, builds the word-list generators,
    turns the configured corpus into flair-decorated paragraphs, and
    writes them as JSON into the dev var directory as test_messages.json.
    """
    gens = load_generators(config)  # returns a dictionary of generators

    paragraphs = parse_file(config, gens, config["corpus"]["filename"])

    output_path = os.path.join(
        get_or_create_dev_uuid_var_path('test-backend'), "test_messages.json",
    )
    write_file(paragraphs, output_path)
|
2017-05-23 23:57:42 +02:00
|
|
|
|
2019-12-13 20:52:23 +01:00
|
|
|
# Module-level configuration, loaded once at import time and shared by
# create_test_data() and get_flair_gen().
config = load_config()
|
2017-05-23 23:57:42 +02:00
|
|
|
|
|
|
|
# Allow regenerating the fixture by running this module directly.
if __name__ == "__main__":
    create_test_data()
|