2021-12-09 13:22:00 +01:00
|
|
|
import unicodedata
|
|
|
|
|
|
|
|
from django.db import connection, migrations, models
|
2023-03-04 01:40:40 +01:00
|
|
|
from django.db.backends.base.schema import BaseDatabaseSchemaEditor
|
2021-12-09 13:22:00 +01:00
|
|
|
from django.db.migrations.state import StateApps
|
|
|
|
|
2022-01-11 22:03:11 +01:00
|
|
|
# Unicode permanently reserves exactly 66 code points as "non-characters"; see
# https://www.unicode.org/faq/private_use.html#nonchar4
unicode_non_chars = {
    # FDD0 through FDEF, inclusive (32 code points).
    *map(chr, range(0xFDD0, 0xFDF0)),
    # 0xFFFE, 0x1FFFE, ... 0x10FFFE inclusive (17 code points).
    *map(chr, range(0xFFFE, 0x110000, 0x10000)),
    # 0xFFFF, 0x1FFFF, ... 0x10FFFF inclusive (17 code points).
    *map(chr, range(0xFFFF, 0x110000, 0x10000)),
}
|
2022-01-11 22:03:11 +01:00
|
|
|
|
|
|
|
|
|
|
|
def character_is_printable(character: str) -> bool:
    """Return True unless ``character`` is one of the rejected code points.

    A character is rejected when its Unicode general category is ``Cc``
    (control) or ``Cs`` (surrogate), or when it is a member of the
    module-level ``unicode_non_chars`` set.
    """
    # Check the category first; the set lookup only runs for characters
    # that are neither control characters nor surrogates.
    if unicodedata.category(character) in ("Cc", "Cs"):
        return False
    return character not in unicode_non_chars
|
2022-01-11 22:03:11 +01:00
|
|
|
|
2021-12-09 13:22:00 +01:00
|
|
|
|
2022-05-27 23:33:51 +02:00
|
|
|
def fix_topics(apps: StateApps, schema_editor: BaseDatabaseSchemaEditor) -> None:
    """Remove rejected characters from the ``subject`` column of messages.

    The ``zerver_message`` table is walked in id windows of ``BATCH_SIZE``.
    Within each window the distinct subjects are fetched, every subject is
    filtered through ``character_is_printable``, and each subject that
    changed is rewritten with one UPDATE restricted to that same window.
    A subject that becomes empty is replaced by ``"(no topic)"``.

    Progress lines and a final summary are printed to stdout.
    ``schema_editor`` is accepted but not used.
    """
    BATCH_SIZE = 10000
    Message = apps.get_model("zerver", "Message")

    max_id = Message.objects.aggregate(models.Max("id"))["id__max"]
    if max_id is None:
        # Nothing to do if there are no messages.
        return

    messages_updated = 0
    print()
    # Each window covers the half-open id interval (lower_bound, upper_bound].
    for lower_bound in range(0, max_id, BATCH_SIZE):
        upper_bound = lower_bound + BATCH_SIZE
        print(f"Processed {lower_bound} / {max_id}")
        with connection.cursor() as cursor:
            cursor.execute(
                "SELECT DISTINCT subject FROM zerver_message WHERE id > %s AND id <= %s",
                [lower_bound, upper_bound],
            )
            # fetchall() pulls every row into memory first, so reusing the
            # same cursor for the UPDATE statements below is safe.
            for (topic,) in cursor.fetchall():
                fixed_topic = "".join(filter(character_is_printable, topic))
                if fixed_topic == topic:
                    # Already clean; leave these rows alone.
                    continue

                # We don't want empty topics for stream messages, so we
                # use (no topic) if the above clean-up leaves us with an empty string.
                if not fixed_topic:
                    fixed_topic = "(no topic)"

                cursor.execute(
                    "UPDATE zerver_message SET subject = %s WHERE subject = %s AND id > %s AND id <= %s",
                    [fixed_topic, topic, lower_bound, upper_bound],
                )
                messages_updated += cursor.rowcount

    if messages_updated > 0:
        print(f"Fixed invalid topics for {messages_updated} messages.")
|
|
|
|
|
|
|
|
|
|
|
|
class Migration(migrations.Migration):
    """Data migration that runs ``fix_topics`` forwards; reversing it is a no-op."""

    # Opt out of wrapping the whole migration in a single transaction.
    atomic = False

    # Must be applied after this earlier ``zerver`` migration.
    dependencies = [("zerver", "0370_realm_enable_spectator_access")]

    operations = [migrations.RunPython(fix_topics, reverse_code=migrations.RunPython.noop)]
|