zulip/zerver/lib/generate_test_data.py

import itertools
import ujson
import random
from typing import List, Dict, Any
import os

from scripts.lib.zulip_tools import get_or_create_dev_uuid_var_path

def load_config() -> Dict[str, Any]:
    """Read and return the parsed generate-data fixture configuration.

    The path is relative, so the file is resolved against the current
    working directory of the calling process.
    """
    fixture_path = "zerver/tests/fixtures/config.generate_data.json"
    with open(fixture_path) as fixture:
        return ujson.load(fixture)

def generate_topics(num_topics: int) -> List[str]:
    """Build ``num_topics`` random topic names from the fixture word lists.

    The first ``num_topics // 3`` topics are single nouns, since one-word
    topics are the most common kind and deserve dedicated coverage.  The
    remainder are assembled from one pick per part of speech, where any
    position may be left out so that topic length varies.
    """
    fodder = load_config()["gen_fodder"]

    single_word_count = num_topics // 3
    topics = [random.choice(fodder["nouns"]) for _ in range(single_word_count)]

    parts_of_speech = ("adjectives", "nouns", "connectors", "verbs", "adverbs")
    for part in parts_of_speech:
        # An empty entry lets a position be skipped, which is what produces
        # topics of varying length.
        fodder[part].append("")

    for _ in range(num_topics - single_word_count):
        picks = [random.choice(fodder[part]) for part in parts_of_speech]
        # Skipped positions were picked as "" and are dropped before joining.
        topics.append(" ".join(word for word in picks if word))

    return topics

def load_generators(config: Dict[str, Any]) -> Dict[str, Any]:
    """Return endlessly cycling iterators over each ``gen_fodder`` list.

    The result maps a generator name to ``itertools.cycle`` over the matching
    list in ``config["gen_fodder"]``.  Every name equals its config key except
    ``"emojis"``, which is fed from the ``"emoji"`` config key.
    """
    fodder = config["gen_fodder"]

    # (generator name, config key) pairs, in the order they are registered.
    sources = [
        ("nouns", "nouns"),
        ("adjectives", "adjectives"),
        ("connectors", "connectors"),
        ("verbs", "verbs"),
        ("adverbs", "adverbs"),
        ("emojis", "emoji"),
        ("links", "links"),
        ("maths", "maths"),
        ("inline-code", "inline-code"),
        ("code-blocks", "code-blocks"),
        ("quote-blocks", "quote-blocks"),
        ("lists", "lists"),
    ]

    return {name: itertools.cycle(fodder[key]) for name, key in sources}

def parse_file(config: Dict[str, Any], gens: Dict[str, Any], corpus_file: str) -> List[str]:
    """Turn the corpus file into a list of flair-decorated paragraphs.

    The file is first collapsed into one string per paragraph, and each
    paragraph is then passed through add_flair().  ``config`` is accepted
    but not used by this function.
    """
    with open(corpus_file) as corpus:
        merged = remove_line_breaks(corpus)
        return add_flair(merged, gens)

def get_flair_gen(length: int) -> List[str]:
    """Return a shuffled list of flair keys, one per paragraph.

    Each key in the module-level ``config["dist_percentages"]`` gets
    ``int(percentage * length / 100)`` slots, so the configured distribution
    is met by construction rather than by chance.  Any slots still missing
    to reach ``length`` are filled with the string ``"None"`` (no flair).
    """
    flair_keys: List[str] = []

    for flair_name, percent in config["dist_percentages"].items():
        quota = int(percent * length / 100)
        flair_keys += [flair_name] * quota

    # Whatever the percentages did not claim stays undecorated.
    shortfall = length - len(flair_keys)
    flair_keys += ["None"] * shortfall

    random.shuffle(flair_keys)
    return flair_keys

def add_flair(paragraphs: List[str], gens: Dict[str, Any]) -> List[str]:
    """Decorate each paragraph with the flair key assigned by get_flair_gen().

    Args:
        paragraphs: The plain paragraphs to decorate.
        gens: Iterators keyed by generator name (as built by
            load_generators()); one sample is consumed with ``next()`` each
            time a flair that needs it is applied.

    Returns:
        A new list with one entry per paragraph, in the original order.

    Paragraphs whose key is ``"None"``, ``"picture"`` (not implemented yet)
    or any key this function does not recognise are returned unchanged.
    """
    # Flair applied by wrapping a random run of words in a Markdown marker.
    md_markers = {"italic": "*", "bold": "**", "strike-thru": "~~"}
    # Flair applied by appending the next sample of the named generator.
    appended_samples = {
        "quote-block": "quote-blocks",
        "inline-code": "inline-code",
        "code-block": "code-blocks",
        "math": "maths",
        "list": "lists",
    }

    results = []

    # roll the dice and see what kind of flair we should add, if any
    flair = get_flair_gen(len(paragraphs))

    for paragraph, key in zip(paragraphs, flair):
        if key in md_markers:
            txt = add_md(md_markers[key], paragraph)
        elif key == "quoted":
            txt = ">" + paragraph
        elif key in appended_samples:
            txt = paragraph + "\n" + next(gens[appended_samples[key]])
        elif key == "emoji":
            txt = add_emoji(paragraph, next(gens["emojis"]))
        elif key == "link":
            txt = add_link(paragraph, next(gens["links"]))
        else:
            # Covers "None", "picture" (TODO: implement pictures) and unknown
            # keys.  Always start from the current paragraph: reusing the
            # previous iteration's text would duplicate a message, or fail
            # outright on the first paragraph.
            txt = paragraph

        results.append(txt)

    return results

def add_md(mode: str, text: str) -> str:
    """Wrap a random run of consecutive words of ``text`` in a Markdown marker.

    Args:
        mode: The marker placed before the first and after the last word of
            the run, e.g. ``"*"`` (italic), ``"**"`` (bold), ``"~~"`` (strike).
        text: The paragraph to decorate.

    Returns:
        The words of ``text`` joined by single spaces, with the marker
        applied.  If ``text`` contains no words, an empty string is returned.
    """
    vals = text.split()
    if not vals:
        # random.randrange(0) raises ValueError, and there is nothing to wrap.
        return ""

    # Pick the run [start, end]; end is never before start, and may equal it.
    start = random.randrange(len(vals))
    end = random.randrange(len(vals) - start) + start
    vals[start] = mode + vals[start]
    vals[end] = vals[end] + mode

    return " ".join(vals)

def add_emoji(text: str, emoji: str) -> str:
    """Insert ``emoji`` after a randomly chosen word of ``text``.

    Args:
        text: The paragraph to decorate.
        emoji: The emoji text to insert.

    Returns:
        The words of ``text`` joined by spaces with the emoji inserted after
        one of them.  If ``text`` contains no words, the emoji alone is
        returned.
    """
    vals = text.split()
    if not vals:
        # random.randrange(0) raises ValueError; there is no word to attach to.
        return emoji

    start = random.randrange(len(vals))

    # The trailing space is kept so the generated output stays the same as
    # before for non-empty text.
    vals[start] = vals[start] + " " + emoji + " "
    return " ".join(vals)

def add_link(text: str, link: str) -> str:
    """Insert ``link`` after a randomly chosen word of ``text``.

    Args:
        text: The paragraph to decorate.
        link: The link text to insert.

    Returns:
        The words of ``text`` joined by spaces with the link inserted after
        one of them.  If ``text`` contains no words, the link alone is
        returned.
    """
    vals = text.split()
    if not vals:
        # random.randrange(0) raises ValueError; there is no word to attach to.
        return link

    start = random.randrange(len(vals))

    # The trailing space is kept so the generated output stays the same as
    # before for non-empty text.
    vals[start] = vals[start] + " " + link + " "

    return " ".join(vals)

def remove_line_breaks(fh: Any) -> List[str]:
    """Collapse the lines of ``fh`` into one string per paragraph.

    ``fh`` is iterated line by line.  Each line is stripped, and every run of
    consecutive non-blank lines becomes a single paragraph with its lines
    joined by one space.  Blank lines only separate paragraphs and never
    produce output of their own.
    """
    stripped_lines = (raw_line.strip() for raw_line in fh)

    # Grouping on truthiness alternates between runs of text lines and runs
    # of blank lines; only the text runs are kept.
    return [
        " ".join(run)
        for has_text, run in itertools.groupby(stripped_lines, key=bool)
        if has_text
    ]

def write_file(paragraphs: List[str], filename: str) -> None:
    """Serialize ``paragraphs`` as JSON and write it to ``filename``.

    The file is opened in ``"w"`` mode, so existing contents are replaced.
    """
    with open(filename, "w") as outfile:
        payload = ujson.dumps(paragraphs)
        outfile.write(payload)

def create_test_data() -> None:
    """Generate decorated messages from the corpus and write them to disk.

    Uses the module-level ``config`` for both the flair generators and the
    corpus filename, and writes the result to ``test_messages.json`` inside
    the ``test-backend`` dev var directory.
    """
    # A dictionary of endlessly cycling generators, keyed by flair source.
    generators = load_generators(config)
    corpus_path = config["corpus"]["filename"]

    messages = parse_file(config, generators, corpus_path)

    output_dir = get_or_create_dev_uuid_var_path('test-backend')
    write_file(messages, os.path.join(output_dir, "test_messages.json"))

# Loaded once at import time.  get_flair_gen() and create_test_data() read
# this module-level value instead of taking the configuration as a parameter.
config = load_config()

# When executed as a script, generate the test messages file.
if __name__ == "__main__":
    create_test_data()
Create complex sample messages for dev VM. Create a generator script to pull lines from a play, enhancing random lines with emoji, Markdown and other flair. With numerous contributions from Rein Zustand and Tim Abbott to finish the project. Fixes: #1666. 2017-05-23 23:57:42 +02:00			`import itertools`
			`import ujson`
			`import random`
zerver core: Remove unused imports. Signed-off-by: Anders Kaseorg <andersk@mit.edu> 2019-02-02 23:53:55 +01:00			`from typing import List, Dict, Any`
test-backend: Move `test_messages.json` to `var/<uuid>/test-backend`. 2019-06-09 20:27:12 +02:00			`import os`

			`from scripts.lib.zulip_tools import get_or_create_dev_uuid_var_path`
Create complex sample messages for dev VM. Create a generator script to pull lines from a play, enhancing random lines with emoji, Markdown and other flair. With numerous contributions from Rein Zustand and Tim Abbott to finish the project. Fixes: #1666. 2017-05-23 23:57:42 +02:00
zerver/lib: Use python 3 syntax for typing. With tweaks by tabbott to fix line spacing. 2017-11-05 11:15:10 +01:00			`def load_config() -> Dict[str, Any]:`
python: Modernize legacy Python 2 syntax with pyupgrade. Generated by `pyupgrade --py3-plus --keep-percent-format` on all our Python code except `zthumbor` and `zulip-ec2-configure-interfaces`, followed by manual indentation fixes. Signed-off-by: Anders Kaseorg <anders@zulipchat.com> 2020-04-09 21:51:58 +02:00			`with open("zerver/tests/fixtures/config.generate_data.json") as infile:`
Create complex sample messages for dev VM. Create a generator script to pull lines from a play, enhancing random lines with emoji, Markdown and other flair. With numerous contributions from Rein Zustand and Tim Abbott to finish the project. Fixes: #1666. 2017-05-23 23:57:42 +02:00			`config = ujson.load(infile)`

			`return config`

populate_db: Generate topics using config fixture. Instead of using stream_name + integers as topics, we now generate topics using pos in `config.generate_data.json`. This helps us create and test more realistic topics. 2020-05-05 07:55:47 +02:00			`def generate_topics(num_topics: int) -> List[str]:`
			`config = load_config()["gen_fodder"]`

			`topics = []`
			`# Make single word topics account for 30% of total topics.`
			`# Single word topics are most common, thus`
			`# it is important we test on it.`
			`num_single_word_topics = num_topics // 3`
			`for _ in itertools.repeat(None, num_single_word_topics):`
			`topics.append(random.choice(config["nouns"]))`

			`sentence = ["adjectives", "nouns", "connectors", "verbs", "adverbs"]`
			`for pos in sentence:`
			`# Add an empty string so that we can generate variable length topics.`
			`config[pos].append("")`

			`for _ in itertools.repeat(None, num_topics - num_single_word_topics):`
			`generated_topic = [random.choice(config[pos]) for pos in sentence]`
			`topic = " ".join(filter(None, generated_topic))`
			`topics.append(topic)`

			`return topics`

zerver/lib: Use python 3 syntax for typing. With tweaks by tabbott to fix line spacing. 2017-11-05 11:15:10 +01:00			`def load_generators(config: Dict[str, Any]) -> Dict[str, Any]:`
Create complex sample messages for dev VM. Create a generator script to pull lines from a play, enhancing random lines with emoji, Markdown and other flair. With numerous contributions from Rein Zustand and Tim Abbott to finish the project. Fixes: #1666. 2017-05-23 23:57:42 +02:00
			`results = {}`
			`cfg = config["gen_fodder"]`

			`results["nouns"] = itertools.cycle(cfg["nouns"])`
			`results["adjectives"] = itertools.cycle(cfg["adjectives"])`
			`results["connectors"] = itertools.cycle(cfg["connectors"])`
			`results["verbs"] = itertools.cycle(cfg["verbs"])`
			`results["adverbs"] = itertools.cycle(cfg["adverbs"])`
			`results["emojis"] = itertools.cycle(cfg["emoji"])`
			`results["links"] = itertools.cycle(cfg["links"])`

			`results["maths"] = itertools.cycle(cfg["maths"])`
			`results["inline-code"] = itertools.cycle(cfg["inline-code"])`
			`results["code-blocks"] = itertools.cycle(cfg["code-blocks"])`
			`results["quote-blocks"] = itertools.cycle(cfg["quote-blocks"])`

			`results["lists"] = itertools.cycle(cfg["lists"])`

			`return results`

zerver/lib: Use python 3 syntax for typing. With tweaks by tabbott to fix line spacing. 2017-11-05 11:15:10 +01:00			`def parse_file(config: Dict[str, Any], gens: Dict[str, Any], corpus_file: str) -> List[str]:`
Create complex sample messages for dev VM. Create a generator script to pull lines from a play, enhancing random lines with emoji, Markdown and other flair. With numerous contributions from Rein Zustand and Tim Abbott to finish the project. Fixes: #1666. 2017-05-23 23:57:42 +02:00
			`# First, load the entire file into a dictionary,`
			`# then apply our custom filters to it as needed.`

python: Convert assignment type annotations to Python 3.6 style. This commit was split by tabbott; this piece covers the vast majority of files in Zulip, but excludes scripts/, tools/, and puppet/ to help ensure we at least show the right error messages for Xenial systems. We can likely further refine the remaining pieces with some testing. Generated by com2ann, with whitespace fixes and various manual fixes for runtime issues: - invoiced_through: Optional[LicenseLedger] = models.ForeignKey( + invoiced_through: Optional["LicenseLedger"] = models.ForeignKey( -_apns_client: Optional[APNsClient] = None +_apns_client: Optional["APNsClient"] = None - notifications_stream: Optional[Stream] = models.ForeignKey('Stream', related_name='+', null=True, blank=True, on_delete=CASCADE) - signup_notifications_stream: Optional[Stream] = models.ForeignKey('Stream', related_name='+', null=True, blank=True, on_delete=CASCADE) + notifications_stream: Optional["Stream"] = models.ForeignKey('Stream', related_name='+', null=True, blank=True, on_delete=CASCADE) + signup_notifications_stream: Optional["Stream"] = models.ForeignKey('Stream', related_name='+', null=True, blank=True, on_delete=CASCADE) - author: Optional[UserProfile] = models.ForeignKey('UserProfile', blank=True, null=True, on_delete=CASCADE) + author: Optional["UserProfile"] = models.ForeignKey('UserProfile', blank=True, null=True, on_delete=CASCADE) - bot_owner: Optional[UserProfile] = models.ForeignKey('self', null=True, on_delete=models.SET_NULL) + bot_owner: Optional["UserProfile"] = models.ForeignKey('self', null=True, on_delete=models.SET_NULL) - default_sending_stream: Optional[Stream] = models.ForeignKey('zerver.Stream', null=True, related_name='+', on_delete=CASCADE) - default_events_register_stream: Optional[Stream] = models.ForeignKey('zerver.Stream', null=True, related_name='+', on_delete=CASCADE) + default_sending_stream: Optional["Stream"] = models.ForeignKey('zerver.Stream', null=True, related_name='+', 
on_delete=CASCADE) + default_events_register_stream: Optional["Stream"] = models.ForeignKey('zerver.Stream', null=True, related_name='+', on_delete=CASCADE) -descriptors_by_handler_id: Dict[int, ClientDescriptor] = {} +descriptors_by_handler_id: Dict[int, "ClientDescriptor"] = {} -worker_classes: Dict[str, Type[QueueProcessingWorker]] = {} -queues: Dict[str, Dict[str, Type[QueueProcessingWorker]]] = {} +worker_classes: Dict[str, Type["QueueProcessingWorker"]] = {} +queues: Dict[str, Dict[str, Type["QueueProcessingWorker"]]] = {} -AUTH_LDAP_REVERSE_EMAIL_SEARCH: Optional[LDAPSearch] = None +AUTH_LDAP_REVERSE_EMAIL_SEARCH: Optional["LDAPSearch"] = None Signed-off-by: Anders Kaseorg <anders@zulipchat.com> 2020-04-22 01:09:50 +02:00			`paragraphs: List[str] = []`
Create complex sample messages for dev VM. Create a generator script to pull lines from a play, enhancing random lines with emoji, Markdown and other flair. With numerous contributions from Rein Zustand and Tim Abbott to finish the project. Fixes: #1666. 2017-05-23 23:57:42 +02:00
python: Modernize legacy Python 2 syntax with pyupgrade. Generated by `pyupgrade --py3-plus --keep-percent-format` on all our Python code except `zthumbor` and `zulip-ec2-configure-interfaces`, followed by manual indentation fixes. Signed-off-by: Anders Kaseorg <anders@zulipchat.com> 2020-04-09 21:51:58 +02:00			`with open(corpus_file) as infile:`
Fix various typos. The typos and their corrections were found with the aid of https://github.com/lucasdemarchi/codespell. 2017-11-09 16:26:38 +01:00			`# OUR DATA: we need to separate the person talking and what they say`
Create complex sample messages for dev VM. Create a generator script to pull lines from a play, enhancing random lines with emoji, Markdown and other flair. With numerous contributions from Rein Zustand and Tim Abbott to finish the project. Fixes: #1666. 2017-05-23 23:57:42 +02:00			`paragraphs = remove_line_breaks(infile)`
			`paragraphs = add_flair(paragraphs, gens)`

			`return paragraphs`

zerver/lib: Use python 3 syntax for typing. With tweaks by tabbott to fix line spacing. 2017-11-05 11:15:10 +01:00			`def get_flair_gen(length: int) -> List[str]:`
Create complex sample messages for dev VM. Create a generator script to pull lines from a play, enhancing random lines with emoji, Markdown and other flair. With numerous contributions from Rein Zustand and Tim Abbott to finish the project. Fixes: #1666. 2017-05-23 23:57:42 +02:00
			`# Grab the percentages from the config file`
			`# create a list that we can consume that will guarantee the distribution`
			`result = []`

			`for k, v in config["dist_percentages"].items():`
			`result.extend([k] * int(v * length / 100))`

			`result.extend(["None"] * (length - len(result)))`

			`random.shuffle(result)`
			`return result`

zerver/lib: Use python 3 syntax for typing. With tweaks by tabbott to fix line spacing. 2017-11-05 11:15:10 +01:00			`def add_flair(paragraphs: List[str], gens: Dict[str, Any]) -> List[str]:`
Create complex sample messages for dev VM. Create a generator script to pull lines from a play, enhancing random lines with emoji, Markdown and other flair. With numerous contributions from Rein Zustand and Tim Abbott to finish the project. Fixes: #1666. 2017-05-23 23:57:42 +02:00
			`# roll the dice and see what kind of flair we should add, if any`
			`results = []`

			`flair = get_flair_gen(len(paragraphs))`

			`for i in range(len(paragraphs)):`
			`key = flair[i]`
			`if key == "None":`
			`txt = paragraphs[i]`
			`elif key == "italic":`
			`txt = add_md("*", paragraphs[i])`
			`elif key == "bold":`
			`txt = add_md("**", paragraphs[i])`
			`elif key == "strike-thru":`
			`txt = add_md("~~", paragraphs[i])`
			`elif key == "quoted":`
			`txt = ">" + paragraphs[i]`
			`elif key == "quote-block":`
			`txt = paragraphs[i] + "\n" + next(gens["quote-blocks"])`
			`elif key == "inline-code":`
			`txt = paragraphs[i] + "\n" + next(gens["inline-code"])`
			`elif key == "code-block":`
			`txt = paragraphs[i] + "\n" + next(gens["code-blocks"])`
			`elif key == "math":`
			`txt = paragraphs[i] + "\n" + next(gens["maths"])`
			`elif key == "list":`
			`txt = paragraphs[i] + "\n" + next(gens["lists"])`
			`elif key == "emoji":`
			`txt = add_emoji(paragraphs[i], next(gens["emojis"]))`
			`elif key == "link":`
			`txt = add_link(paragraphs[i], next(gens["links"]))`
			`elif key == "picture":`
			`txt = txt # TODO: implement pictures`

			`results.append(txt)`

			`return results`

zerver/lib: Use python 3 syntax for typing. With tweaks by tabbott to fix line spacing. 2017-11-05 11:15:10 +01:00			`def add_md(mode: str, text: str) -> str:`
Create complex sample messages for dev VM. Create a generator script to pull lines from a play, enhancing random lines with emoji, Markdown and other flair. With numerous contributions from Rein Zustand and Tim Abbott to finish the project. Fixes: #1666. 2017-05-23 23:57:42 +02:00
			`# mode means: bold, italic, etc.`
			`# to add a list at the end of a paragraph, * iterm one\n * item two`

			`# find out how long the line is, then insert the mode before the end`

			`vals = text.split()`
			`start = random.randrange(len(vals))`
			`end = random.randrange(len(vals) - start) + start`
			`vals[start] = mode + vals[start]`
			`vals[end] = vals[end] + mode`

			`return " ".join(vals).strip()`

zerver/lib: Use python 3 syntax for typing. With tweaks by tabbott to fix line spacing. 2017-11-05 11:15:10 +01:00			`def add_emoji(text: str, emoji: str) -> str:`
Create complex sample messages for dev VM. Create a generator script to pull lines from a play, enhancing random lines with emoji, Markdown and other flair. With numerous contributions from Rein Zustand and Tim Abbott to finish the project. Fixes: #1666. 2017-05-23 23:57:42 +02:00
			`vals = text.split()`
			`start = random.randrange(len(vals))`

			`vals[start] = vals[start] + " " + emoji + " "`
			`return " ".join(vals)`

zerver/lib: Use python 3 syntax for typing. With tweaks by tabbott to fix line spacing. 2017-11-05 11:15:10 +01:00			`def add_link(text: str, link: str) -> str:`
Create complex sample messages for dev VM. Create a generator script to pull lines from a play, enhancing random lines with emoji, Markdown and other flair. With numerous contributions from Rein Zustand and Tim Abbott to finish the project. Fixes: #1666. 2017-05-23 23:57:42 +02:00
			`vals = text.split()`
			`start = random.randrange(len(vals))`

			`vals[start] = vals[start] + " " + link + " "`

			`return " ".join(vals)`

zerver/lib: Use python 3 syntax for typing. With tweaks by tabbott to fix line spacing. 2017-11-05 11:15:10 +01:00			`def remove_line_breaks(fh: Any) -> List[str]:`
Create complex sample messages for dev VM. Create a generator script to pull lines from a play, enhancing random lines with emoji, Markdown and other flair. With numerous contributions from Rein Zustand and Tim Abbott to finish the project. Fixes: #1666. 2017-05-23 23:57:42 +02:00
			`# We're going to remove line breaks from paragraphs`
			`results = [] # save the dialogs as tuples with (author, dialog)`

			`para = [] # we'll store the lines here to form a paragraph`

			`for line in fh:`
			`text = line.strip()`
			`if text != "":`
			`para.append(text)`
			`else:`
generate_test_data: Fix remove_line_breaks algorithm. 2017-08-05 00:28:08 +02:00			`if para:`
Create complex sample messages for dev VM. Create a generator script to pull lines from a play, enhancing random lines with emoji, Markdown and other flair. With numerous contributions from Rein Zustand and Tim Abbott to finish the project. Fixes: #1666. 2017-05-23 23:57:42 +02:00			`results.append(" ".join(para))`
			`# reset the paragraph`
			`para = []`
generate_test_data: Fix remove_line_breaks algorithm. 2017-08-05 00:28:08 +02:00			`if para:`
			`results.append(" ".join(para))`
Create complex sample messages for dev VM. Create a generator script to pull lines from a play, enhancing random lines with emoji, Markdown and other flair. With numerous contributions from Rein Zustand and Tim Abbott to finish the project. Fixes: #1666. 2017-05-23 23:57:42 +02:00
			`return results`

zerver/lib: Use python 3 syntax for typing. With tweaks by tabbott to fix line spacing. 2017-11-05 11:15:10 +01:00			`def write_file(paragraphs: List[str], filename: str) -> None:`
Create complex sample messages for dev VM. Create a generator script to pull lines from a play, enhancing random lines with emoji, Markdown and other flair. With numerous contributions from Rein Zustand and Tim Abbott to finish the project. Fixes: #1666. 2017-05-23 23:57:42 +02:00
			`with open(filename, "w") as outfile:`
			`outfile.write(ujson.dumps(paragraphs))`

zerver/lib: Use python 3 syntax for typing. With tweaks by tabbott to fix line spacing. 2017-11-05 11:15:10 +01:00			`def create_test_data() -> None:`
Create complex sample messages for dev VM. Create a generator script to pull lines from a play, enhancing random lines with emoji, Markdown and other flair. With numerous contributions from Rein Zustand and Tim Abbott to finish the project. Fixes: #1666. 2017-05-23 23:57:42 +02:00
			`gens = load_generators(config) # returns a dictionary of generators`

			`paragraphs = parse_file(config, gens, config["corpus"]["filename"])`

test-backend: Move `test_messages.json` to `var/<uuid>/test-backend`. 2019-06-09 20:27:12 +02:00			`write_file(paragraphs, os.path.join(get_or_create_dev_uuid_var_path('test-backend'),`
			`"test_messages.json"))`
Create complex sample messages for dev VM. Create a generator script to pull lines from a play, enhancing random lines with emoji, Markdown and other flair. With numerous contributions from Rein Zustand and Tim Abbott to finish the project. Fixes: #1666. 2017-05-23 23:57:42 +02:00
generate_test_data: Remove some useless type annotations. One of these caused a parser error trying to run pyre on Zulip; the other is just useless as the type can be inferred. 2019-12-13 20:52:23 +01:00			`config = load_config()`
Create complex sample messages for dev VM. Create a generator script to pull lines from a play, enhancing random lines with emoji, Markdown and other flair. With numerous contributions from Rein Zustand and Tim Abbott to finish the project. Fixes: #1666. 2017-05-23 23:57:42 +02:00
			`if __name__ == "__main__":`
generate_test_data: Remove some useless type annotations. One of these caused a parser error trying to run pyre on Zulip; the other is just useless as the type can be inferred. 2019-12-13 20:52:23 +01:00			`create_test_data()`