# zulip/zerver/data_import/rocketchat.py

import hashlib
import logging
import os
import random
import secrets
import uuid
from typing import Any, Dict, List, Set, Tuple
import bson
from django.conf import settings
from django.forms.models import model_to_dict
from zerver.data_import.import_util import (
SubscriberHandler,
ZerverFieldsT,
build_attachment,
build_huddle,
build_huddle_subscriptions,
build_message,
build_personal_subscriptions,
build_realm,
build_realm_emoji,
build_recipients,
build_stream,
build_stream_subscriptions,
build_user_profile,
build_zerver_realm,
create_converted_data_files,
make_subscriber_map,
make_user_messages,
)
from zerver.data_import.sequencer import NEXT_ID, IdMapper
from zerver.data_import.user_handler import UserHandler
from zerver.lib.emoji import name_to_codepoint
from zerver.lib.markdown import IMAGE_EXTENSIONS
from zerver.lib.upload.base import sanitize_name
from zerver.lib.utils import process_list_in_batches
from zerver.models import Reaction, RealmEmoji, Recipient, UserProfile
def make_realm(
realm_id: int, realm_subdomain: str, domain_name: str, rc_instance: Dict[str, Any]
) -> ZerverFieldsT:
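    """Build the top-level realm dictionary for the import, dating the
    Zulip realm from the Rocket.Chat instance's creation timestamp."""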
created_at = float(rc_instance["_createdAt"].timestamp())
zerver_realm = build_zerver_realm(realm_id, realm_subdomain, created_at, "Rocket.Chat")
realm = build_realm(zerver_realm, realm_id, domain_name)
# We may override these later.
realm["zerver_defaultstream"] = []
return realm
def process_users(
user_id_to_user_map: Dict[str, Dict[str, Any]],
realm_id: int,
domain_name: str,
user_handler: UserHandler,
user_id_mapper: IdMapper,
) -> None:
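    """Convert Rocket.Chat users into Zulip UserProfile rows.

    Accounts whose type is not "user" (bots and livechat guests) are
    imported as deactivated or mirror-dummy users, Rocket.Chat admins
    become realm owners, and every bot is owned by the first realm owner.
    """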
realm_owners: List[int] = []
bots: List[int] = []
for rc_user_id in user_id_to_user_map:
user_dict = user_id_to_user_map[rc_user_id]
is_mirror_dummy = False
is_bot = False
is_active = True
# Rocket.Chat has three user types:
# "user": This is a regular user of the system.
# "bot": A special user type for bots.
# "unknown": This usually represents a livechat guest.
if user_dict["type"] != "user":
is_active = False
if user_dict["type"] == "bot":
is_bot = True
else:
is_mirror_dummy = True
if user_dict.get("emails") is None:
user_dict["emails"] = [
{
"address": "{}-{}@{}".format(
user_dict["username"], user_dict["type"], domain_name
)
}
]
# TODO: Change this to use actual exported avatar
avatar_source = "G"
full_name = user_dict["name"]
id = user_id_mapper.get(rc_user_id)
delivery_email = user_dict["emails"][0]["address"]
email = user_dict["emails"][0]["address"]
short_name = user_dict["username"]
date_joined = float(user_dict["createdAt"].timestamp())
timezone = "UTC"
role = UserProfile.ROLE_MEMBER
if "admin" in user_dict["roles"]:
role = UserProfile.ROLE_REALM_OWNER
realm_owners.append(id)
elif "guest" in user_dict["roles"]:
role = UserProfile.ROLE_GUEST
elif "bot" in user_dict["roles"]:
is_bot = True
if is_bot:
bots.append(id)
user = build_user_profile(
avatar_source=avatar_source,
date_joined=date_joined,
delivery_email=delivery_email,
email=email,
full_name=full_name,
id=id,
is_active=is_active,
role=role,
is_mirror_dummy=is_mirror_dummy,
realm_id=realm_id,
short_name=short_name,
timezone=timezone,
is_bot=is_bot,
bot_type=1 if is_bot else None,
)
user_handler.add_user(user)
# Set the first realm_owner as the owner of
# all the bots.
if realm_owners:
for bot_id in bots:
bot_user = user_handler.get_user(user_id=bot_id)
bot_user["bot_owner"] = realm_owners[0]
def truncate_name(name: str, name_id: int, max_length: int = 60) -> str:
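    """Truncate `name` to at most `max_length` characters, appending a
    " [name_id]" suffix so that truncated names stay distinguishable."""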
if len(name) > max_length:
name_id_suffix = f" [{name_id}]"
name = name[0 : max_length - len(name_id_suffix)] + name_id_suffix
return name
def get_stream_name(rc_channel: Dict[str, Any]) -> str:
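    """Compute the Zulip stream name for a Rocket.Chat channel, prefixing
    a team's main channel with "[TEAM]" and truncating long names."""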
if rc_channel.get("teamMain"):
stream_name = f'[TEAM] {rc_channel["name"]}'
else:
stream_name = rc_channel["name"]
stream_name = truncate_name(stream_name, rc_channel["_id"])
return stream_name
def convert_channel_data(
room_id_to_room_map: Dict[str, Dict[str, Any]],
team_id_to_team_map: Dict[str, Dict[str, Any]],
stream_id_mapper: IdMapper,
realm_id: int,
) -> List[ZerverFieldsT]:
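    """Convert Rocket.Chat channels (rooms) into Zulip streams.

    Private channels become invite-only streams, and read-only channels
    restrict posting to admins and moderators via stream_post_policy.
    """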
streams = []
for rc_room_id in room_id_to_room_map:
channel_dict = room_id_to_room_map[rc_room_id]
date_created = float(channel_dict["ts"].timestamp())
stream_id = stream_id_mapper.get(rc_room_id)
invite_only = channel_dict["t"] == "p"
stream_name = get_stream_name(channel_dict)
stream_desc = channel_dict.get("description", "")
if channel_dict.get("teamId") and not channel_dict.get("teamMain"):
stream_desc = "[Team {} channel]. {}".format(
team_id_to_team_map[channel_dict["teamId"]]["name"], stream_desc
)
# If the channel is read-only, then only admins and moderators
# should be allowed to post in the converted Zulip stream.
# For more details: https://zulip.com/help/stream-sending-policy
#
# See the `Stream` model in `zerver/models/streams.py` to learn what each
# number represents.
stream_post_policy = 4 if channel_dict.get("ro", False) else 1
stream = build_stream(
date_created=date_created,
realm_id=realm_id,
name=stream_name,
description=stream_desc,
stream_id=stream_id,
deactivated=False,
invite_only=invite_only,
stream_post_policy=stream_post_policy,
)
streams.append(stream)
return streams
def convert_stream_subscription_data(
user_id_to_user_map: Dict[str, Dict[str, Any]],
dsc_id_to_dsc_map: Dict[str, Dict[str, Any]],
zerver_stream: List[ZerverFieldsT],
stream_id_mapper: IdMapper,
user_id_mapper: IdMapper,
subscriber_handler: SubscriberHandler,
) -> None:
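    """Record each converted stream's subscribers, derived from the
    `__rooms` field of the Rocket.Chat user documents; streams that end
    up without subscribers are marked as deactivated."""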
stream_members_map: Dict[int, Set[int]] = {}
for rc_user_id in user_id_to_user_map:
user_dict = user_id_to_user_map[rc_user_id]
if not user_dict.get("__rooms"):
continue
for channel in user_dict["__rooms"]:
if channel in dsc_id_to_dsc_map:
# Ignore discussion rooms, as these are not
# imported as streams, but as topics.
continue
stream_id = stream_id_mapper.get(channel)
if stream_id not in stream_members_map:
stream_members_map[stream_id] = set()
stream_members_map[stream_id].add(user_id_mapper.get(rc_user_id))
for stream in zerver_stream:
if stream["id"] in stream_members_map:
users = stream_members_map[stream["id"]]
else:
users = set()
# Mark the stream as deactivated, since
# it has no subscribers.
stream["deactivated"] = True
subscriber_handler.set_info(users=users, stream_id=stream["id"])
def convert_huddle_data(
huddle_id_to_huddle_map: Dict[str, Dict[str, Any]],
huddle_id_mapper: IdMapper,
user_id_mapper: IdMapper,
subscriber_handler: SubscriberHandler,
) -> List[ZerverFieldsT]:
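    """Convert Rocket.Chat group direct-message channels into Zulip
    huddles and record their members in `subscriber_handler`."""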
zerver_huddle: List[ZerverFieldsT] = []
for rc_huddle_id in huddle_id_to_huddle_map:
huddle_id = huddle_id_mapper.get(rc_huddle_id)
huddle = build_huddle(huddle_id)
zerver_huddle.append(huddle)
huddle_dict = huddle_id_to_huddle_map[rc_huddle_id]
huddle_user_ids = set()
for rc_user_id in huddle_dict["uids"]:
huddle_user_ids.add(user_id_mapper.get(rc_user_id))
subscriber_handler.set_info(
users=huddle_user_ids,
huddle_id=huddle_id,
)
return zerver_huddle
def build_custom_emoji(
realm_id: int, custom_emoji_data: Dict[str, List[Dict[str, Any]]], output_dir: str
) -> List[ZerverFieldsT]:
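    """Reassemble the custom emoji files from their chunks, write them
    (plus a records.json file) under the output directory's emoji/
    folder, and return RealmEmoji rows for every emoji name and alias."""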
logging.info("Starting to process custom emoji")
emoji_folder = os.path.join(output_dir, "emoji")
os.makedirs(emoji_folder, exist_ok=True)
zerver_realmemoji: List[ZerverFieldsT] = []
emoji_records: List[ZerverFieldsT] = []
# Map emoji file_id to emoji file data
emoji_file_data = {}
for emoji_file in custom_emoji_data["file"]:
emoji_file_data[emoji_file["_id"]] = {"filename": emoji_file["filename"], "chunks": []}
for emoji_chunk in custom_emoji_data["chunk"]:
emoji_file_data[emoji_chunk["files_id"]]["chunks"].append(emoji_chunk["data"])
# Build custom emoji
for rc_emoji in custom_emoji_data["emoji"]:
# Subject to change with changes in the Rocket.Chat database schema
emoji_file_id = f'{rc_emoji["name"]}.{rc_emoji["extension"]}'
emoji_file_info = emoji_file_data[emoji_file_id]
emoji_filename = emoji_file_info["filename"]
emoji_data = b"".join(emoji_file_info["chunks"])
target_sub_path = RealmEmoji.PATH_ID_TEMPLATE.format(
realm_id=realm_id,
emoji_file_name=emoji_filename,
)
target_path = os.path.join(emoji_folder, target_sub_path)
os.makedirs(os.path.dirname(target_path), exist_ok=True)
with open(target_path, "wb") as e_file:
e_file.write(emoji_data)
emoji_aliases = [rc_emoji["name"]]
emoji_aliases.extend(rc_emoji["aliases"])
for alias in emoji_aliases:
emoji_record = dict(
path=target_path,
s3_path=target_path,
file_name=emoji_filename,
realm_id=realm_id,
name=alias,
)
emoji_records.append(emoji_record)
realmemoji = build_realm_emoji(
realm_id=realm_id,
name=alias,
id=NEXT_ID("realmemoji"),
file_name=emoji_filename,
)
zerver_realmemoji.append(realmemoji)
create_converted_data_files(emoji_records, output_dir, "/emoji/records.json")
logging.info("Done processing emoji")
return zerver_realmemoji
def build_reactions(
total_reactions: List[ZerverFieldsT],
reactions: List[Dict[str, Any]],
message_id: int,
zerver_realmemoji: List[ZerverFieldsT],
) -> None:
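    """Append Zulip Reaction rows for a message's Rocket.Chat reactions,
    resolving each emoji name to a realm emoji or a Unicode codepoint;
    emoji that cannot be resolved are skipped."""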
realmemoji = {}
for emoji in zerver_realmemoji:
realmemoji[emoji["name"]] = emoji["id"]
# For Unicode emoji codes, we use the equivalent of the
# emoji_name_to_emoji_code function in zerver/lib/emoji.py here.
for reaction_dict in reactions:
emoji_name = reaction_dict["name"]
user_id = reaction_dict["user_id"]
# Check in realm emoji
if emoji_name in realmemoji:
emoji_code = realmemoji[emoji_name]
reaction_type = Reaction.REALM_EMOJI
# Check in Unicode emoji
elif emoji_name in name_to_codepoint:
emoji_code = name_to_codepoint[emoji_name]
reaction_type = Reaction.UNICODE_EMOJI
else: # nocoverage
continue
reaction_id = NEXT_ID("reaction")
reaction = Reaction(
id=reaction_id,
emoji_code=emoji_code,
emoji_name=emoji_name,
reaction_type=reaction_type,
)
reaction_dict = model_to_dict(reaction, exclude=["message", "user_profile"])
reaction_dict["message"] = message_id
reaction_dict["user_profile"] = user_id
total_reactions.append(reaction_dict)
def process_message_attachment(
upload: Dict[str, Any],
realm_id: int,
message_id: int,
user_id: int,
user_handler: UserHandler,
zerver_attachment: List[ZerverFieldsT],
uploads_list: List[ZerverFieldsT],
upload_id_to_upload_data_map: Dict[str, Dict[str, Any]],
output_dir: str,
) -> Tuple[str, bool]:
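    """Reassemble an uploaded file from its chunks, write it under
    uploads/ in the output directory, and register it in `uploads_list`
    and `zerver_attachment`.

    Returns the Markdown content to append to the message (the upload's
    description plus a /user_uploads link) and whether it is an image.
    """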
if upload["_id"] not in upload_id_to_upload_data_map: # nocoverage
logging.info("Skipping unknown attachment of message_id: %s", message_id)
return "", False
if "type" not in upload: # nocoverage
logging.info("Skipping attachment without type of message_id: %s", message_id)
return "", False
upload_file_data = upload_id_to_upload_data_map[upload["_id"]]
file_name = upload["name"]
file_ext = f'.{upload["type"].split("/")[-1]}'
has_image = False
if file_ext.lower() in IMAGE_EXTENSIONS:
has_image = True
try:
sanitized_name = sanitize_name(file_name)
except AssertionError: # nocoverage
logging.info("Replacing invalid attachment name with random uuid: %s", file_name)
sanitized_name = uuid.uuid4().hex
if len(sanitized_name) >= 255: # nocoverage
logging.info("Replacing too long attachment name with random uuid: %s", file_name)
sanitized_name = uuid.uuid4().hex
s3_path = "/".join(
[
str(realm_id),
format(random.randint(0, 255), "x"),
secrets.token_urlsafe(18),
sanitized_name,
]
)
# Build the attachment from chunks and save it to s3_path.
file_out_path = os.path.join(output_dir, "uploads", s3_path)
os.makedirs(os.path.dirname(file_out_path), exist_ok=True)
with open(file_out_path, "wb") as upload_file:
upload_file.write(b"".join(upload_file_data["chunk"]))
attachment_content = (
f'{upload_file_data.get("description", "")}\n\n[{file_name}](/user_uploads/{s3_path})'
)
fileinfo = {
"name": file_name,
"size": upload_file_data["size"],
"created": float(upload_file_data["_updatedAt"].timestamp()),
}
upload = dict(
path=s3_path,
realm_id=realm_id,
content_type=upload["type"],
user_profile_id=user_id,
last_modified=fileinfo["created"],
user_profile_email=user_handler.get_user(user_id=user_id)["email"],
s3_path=s3_path,
size=fileinfo["size"],
)
uploads_list.append(upload)
build_attachment(
realm_id=realm_id,
message_ids={message_id},
user_id=user_id,
fileinfo=fileinfo,
s3_path=s3_path,
zerver_attachment=zerver_attachment,
)
return attachment_content, has_image
def process_raw_message_batch(
realm_id: int,
raw_messages: List[Dict[str, Any]],
subscriber_map: Dict[int, Set[int]],
user_handler: UserHandler,
is_pm_data: bool,
output_dir: str,
zerver_realmemoji: List[ZerverFieldsT],
total_reactions: List[ZerverFieldsT],
uploads_list: List[ZerverFieldsT],
zerver_attachment: List[ZerverFieldsT],
upload_id_to_upload_data_map: Dict[str, Dict[str, Any]],
) -> None:
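    """Convert one batch of preprocessed Rocket.Chat messages into Zulip
    Message and UserMessage rows, rewriting user and channel mentions
    into Zulip syntax, and write them to a messages-*.json file."""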
def fix_mentions(
content: str, mention_user_ids: Set[int], rc_channel_mention_data: List[Dict[str, str]]
) -> str:
# Fix user mentions
for user_id in mention_user_ids:
user = user_handler.get_user(user_id=user_id)
rc_mention = "@{short_name}".format(**user)
zulip_mention = "@**{full_name}**".format(**user)
content = content.replace(rc_mention, zulip_mention)
content = content.replace("@all", "@**all**")
# We don't have an equivalent for Rocket.Chat's @here mention,
# which mentions all users active in the channel.
content = content.replace("@here", "@**all**")
# Fix channel mentions
for mention_data in rc_channel_mention_data:
rc_mention = mention_data["rc_mention"]
zulip_mention = mention_data["zulip_mention"]
content = content.replace(rc_mention, zulip_mention)
return content
user_mention_map: Dict[int, Set[int]] = {}
wildcard_mention_map: Dict[int, bool] = {}
zerver_message: List[ZerverFieldsT] = []
for raw_message in raw_messages:
message_id = NEXT_ID("message")
mention_user_ids = raw_message["mention_user_ids"]
user_mention_map[message_id] = mention_user_ids
wildcard_mention_map[message_id] = raw_message["wildcard_mention"]
content = fix_mentions(
content=raw_message["content"],
mention_user_ids=mention_user_ids,
rc_channel_mention_data=raw_message["rc_channel_mention_data"],
)
if len(content) > 10000: # nocoverage
logging.info("skipping too-long message of length %s", len(content))
continue
date_sent = raw_message["date_sent"]
sender_user_id = raw_message["sender_id"]
recipient_id = raw_message["recipient_id"]
rendered_content = None
has_attachment = False
has_image = False
has_link = raw_message["has_link"]
if "file" in raw_message:
has_attachment = True
has_link = True
attachment_content, has_image = process_message_attachment(
upload=raw_message["file"],
realm_id=realm_id,
message_id=message_id,
user_id=sender_user_id,
user_handler=user_handler,
uploads_list=uploads_list,
zerver_attachment=zerver_attachment,
upload_id_to_upload_data_map=upload_id_to_upload_data_map,
output_dir=output_dir,
)
content += attachment_content
topic_name = raw_message["topic_name"]
message = build_message(
content=content,
message_id=message_id,
date_sent=date_sent,
recipient_id=recipient_id,
realm_id=realm_id,
rendered_content=rendered_content,
topic_name=topic_name,
user_id=sender_user_id,
has_image=has_image,
has_link=has_link,
has_attachment=has_attachment,
)
zerver_message.append(message)
build_reactions(
total_reactions=total_reactions,
reactions=raw_message["reactions"],
message_id=message_id,
zerver_realmemoji=zerver_realmemoji,
)
zerver_usermessage = make_user_messages(
zerver_message=zerver_message,
subscriber_map=subscriber_map,
is_pm_data=is_pm_data,
mention_map=user_mention_map,
wildcard_mention_map=wildcard_mention_map,
)
message_json = dict(
zerver_message=zerver_message,
zerver_usermessage=zerver_usermessage,
)
dump_file_id = NEXT_ID("dump_file_id" + str(realm_id))
message_file = f"/messages-{dump_file_id:06}.json"
create_converted_data_files(message_json, output_dir, message_file)
def get_topic_name(
message: Dict[str, Any],
dsc_id_to_dsc_map: Dict[str, Dict[str, Any]],
thread_id_mapper: IdMapper,
is_pm_data: bool = False,
) -> str:
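    """Compute the Zulip topic for a Rocket.Chat message: the empty string
    for direct messages, the discussion name for discussion messages, a
    per-thread topic for threaded messages, and the generic "Imported
    from Rocket.Chat" topic otherwise."""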
if is_pm_data:
return ""
elif message["rid"] in dsc_id_to_dsc_map:
dsc_channel_name = dsc_id_to_dsc_map[message["rid"]]["fname"]
return truncate_name(f"{dsc_channel_name} (Imported from Rocket.Chat)", message["rid"])
elif message.get("replies"):
# Message is the start of a thread
thread_id = thread_id_mapper.get(message["_id"])
return truncate_name(f"Thread {thread_id} (Imported from Rocket.Chat)", message["_id"])
elif message.get("tmid"):
# Message is a part of a thread
thread_id = thread_id_mapper.get(message["tmid"])
return truncate_name(f"Thread {thread_id} (Imported from Rocket.Chat)", message["tmid"])
else:
# Normal channel message
return "Imported from Rocket.Chat"
def process_messages(
realm_id: int,
messages: List[Dict[str, Any]],
subscriber_map: Dict[int, Set[int]],
is_pm_data: bool,
username_to_user_id_map: Dict[str, str],
user_id_mapper: IdMapper,
user_handler: UserHandler,
user_id_to_recipient_id: Dict[int, int],
stream_id_mapper: IdMapper,
stream_id_to_recipient_id: Dict[int, int],
huddle_id_mapper: IdMapper,
huddle_id_to_recipient_id: Dict[int, int],
thread_id_mapper: IdMapper,
room_id_to_room_map: Dict[str, Dict[str, Any]],
dsc_id_to_dsc_map: Dict[str, Dict[str, Any]],
direct_id_to_direct_map: Dict[str, Dict[str, Any]],
huddle_id_to_huddle_map: Dict[str, Dict[str, Any]],
zerver_realmemoji: List[ZerverFieldsT],
total_reactions: List[ZerverFieldsT],
uploads_list: List[ZerverFieldsT],
zerver_attachment: List[ZerverFieldsT],
upload_id_to_upload_data_map: Dict[str, Dict[str, Any]],
output_dir: str,
) -> None:
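    """Convert a list of Rocket.Chat messages (channel, direct, or huddle)
    into Zulip's format, resolving recipients, topics, reactions,
    mentions, and attachments.

    System notification messages are skipped, and the converted messages
    are written out in batches of 1000 per file.
    """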
def list_reactions(reactions: Dict[str, Dict[str, Any]]) -> List[Dict[str, Any]]:
# List of dictionaries of form:
# {"name": "smile", "user_id": 2}
reactions_list: List[Dict[str, Any]] = []
for react_code in reactions:
name = react_code.split(":")[1]
usernames = reactions[react_code]["usernames"]
for username in usernames:
if username not in username_to_user_id_map: # nocoverage
# This can happen with production data when old usernames no longer exist. We cannot do
# much about it here, so we just ignore the unknown username.
continue
rc_user_id = username_to_user_id_map[username]
user_id = user_id_mapper.get(rc_user_id)
reactions_list.append({"name": name, "user_id": user_id})
return reactions_list
def message_to_dict(message: Dict[str, Any]) -> Dict[str, Any]:
rc_sender_id = message["u"]["_id"]
sender_id = user_id_mapper.get(rc_sender_id)
if "msg" in message:
content = message["msg"]
else: # nocoverage
content = "This message imported from Rocket.Chat had no body in the data export."
logging.info(
"Message %s contains no message content: %s",
message["_id"],
message,
)
if message.get("reactions"):
reactions = list_reactions(message["reactions"])
else:
reactions = []
message_dict = dict(
sender_id=sender_id,
content=content,
date_sent=int(message["ts"].timestamp()),
reactions=reactions,
has_link=bool(message.get("urls")),
)
# Add recipient_id to message_dict
if is_pm_data:
# Message is in a 1:1 or group direct message.
rc_channel_id = message["rid"]
if rc_channel_id in huddle_id_to_huddle_map:
huddle_id = huddle_id_mapper.get(rc_channel_id)
message_dict["recipient_id"] = huddle_id_to_recipient_id[huddle_id]
else:
rc_member_ids = direct_id_to_direct_map[rc_channel_id]["uids"]
if len(rc_member_ids) == 1: # nocoverage
# Direct messages to yourself have only one user in `uids`.
rc_member_ids.append(rc_member_ids[0])
if rc_sender_id == rc_member_ids[0]:
zulip_member_id = user_id_mapper.get(rc_member_ids[1])
message_dict["recipient_id"] = user_id_to_recipient_id[zulip_member_id]
else:
zulip_member_id = user_id_mapper.get(rc_member_ids[0])
message_dict["recipient_id"] = user_id_to_recipient_id[zulip_member_id]
elif message["rid"] in dsc_id_to_dsc_map:
# Message is in a discussion
dsc_channel = dsc_id_to_dsc_map[message["rid"]]
parent_channel_id = dsc_channel["prid"]
stream_id = stream_id_mapper.get(parent_channel_id)
message_dict["recipient_id"] = stream_id_to_recipient_id[stream_id]
else:
stream_id = stream_id_mapper.get(message["rid"])
message_dict["recipient_id"] = stream_id_to_recipient_id[stream_id]
# Add topic name to message_dict
message_dict["topic_name"] = get_topic_name(
message, dsc_id_to_dsc_map, thread_id_mapper, is_pm_data
)
# Add user mentions to message_dict
mention_user_ids = set()
wildcard_mention = False
for mention in message.get("mentions", []):
mention_id = mention["_id"]
if mention_id in ["all", "here"]:
wildcard_mention = True
continue
if user_id_mapper.has(mention_id):
user_id = user_id_mapper.get(mention_id)
mention_user_ids.add(user_id)
else: # nocoverage
logging.info(
"Message %s contains mention of unknown user %s: %s",
message["_id"],
mention_id,
mention,
)
message_dict["mention_user_ids"] = mention_user_ids
message_dict["wildcard_mention"] = wildcard_mention
# Add channel mentions to message_dict
rc_channel_mention_data: List[Dict[str, str]] = []
for mention in message.get("channels", []):
mention_rc_channel_id = mention["_id"]
mention_rc_channel_name = mention["name"]
rc_mention = f"#{mention_rc_channel_name}"
if mention_rc_channel_id in room_id_to_room_map:
# Channel is converted to a stream.
rc_channel = room_id_to_room_map[mention_rc_channel_id]
converted_stream_name = get_stream_name(rc_channel)
zulip_mention = f"#**{converted_stream_name}**"
elif mention_rc_channel_id in dsc_id_to_dsc_map:
# Channel is a discussion and is converted to a topic.
dsc_channel = dsc_id_to_dsc_map[mention_rc_channel_id]
parent_channel_id = dsc_channel["prid"]
if (
parent_channel_id in direct_id_to_direct_map
or parent_channel_id in huddle_id_to_huddle_map
):
# This discussion belongs to a direct channel and thus should not
# be linked.
# This logging statement serves the side benefit of avoiding the
# CPython optimization for `continue` so that the coverage reports
# aren't misleading.
logging.info(
"skipping direct messages discussion mention: %s", dsc_channel["fname"]
)
continue
converted_topic_name = get_topic_name(
message={"rid": mention_rc_channel_id},
dsc_id_to_dsc_map=dsc_id_to_dsc_map,
thread_id_mapper=thread_id_mapper,
)
parent_rc_channel = room_id_to_room_map[parent_channel_id]
parent_stream_name = get_stream_name(parent_rc_channel)
zulip_mention = f"#**{parent_stream_name}>{converted_topic_name}**"
else: # nocoverage
logging.info("Failed to map mention '%s' to zulip syntax.", mention)
continue
mention_data = {"rc_mention": rc_mention, "zulip_mention": zulip_mention}
rc_channel_mention_data.append(mention_data)
message_dict["rc_channel_mention_data"] = rc_channel_mention_data
# Add uploaded file (attachment) to message_dict
if message.get("file"):
message_dict["file"] = message["file"]
return message_dict
raw_messages: List[Dict[str, Any]] = []
for message in messages:
if message.get("t") is not None:
# Messages with a type are system notifications like user_joined
# that we don't include.
continue
raw_messages.append(message_to_dict(message))
def process_batch(lst: List[Dict[str, Any]]) -> None:
process_raw_message_batch(
realm_id=realm_id,
raw_messages=lst,
subscriber_map=subscriber_map,
user_handler=user_handler,
is_pm_data=is_pm_data,
output_dir=output_dir,
zerver_realmemoji=zerver_realmemoji,
total_reactions=total_reactions,
uploads_list=uploads_list,
zerver_attachment=zerver_attachment,
upload_id_to_upload_data_map=upload_id_to_upload_data_map,
)
chunk_size = 1000
process_list_in_batches(
lst=raw_messages,
chunk_size=chunk_size,
process_batch=process_batch,
)
def map_upload_id_to_upload_data(
upload_data: Dict[str, List[Dict[str, Any]]],
) -> Dict[str, Dict[str, Any]]:
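    """Group each upload's data chunks with its metadata, keyed by the
    upload's `_id`; chunks without matching metadata are skipped."""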
upload_id_to_upload_data_map: Dict[str, Dict[str, Any]] = {}
for upload in upload_data["upload"]:
upload_id_to_upload_data_map[upload["_id"]] = {**upload, "chunk": []}
for chunk in upload_data["chunk"]:
if chunk["files_id"] not in upload_id_to_upload_data_map: # nocoverage
logging.info("Skipping chunk %s without metadata", chunk["files_id"])
# It's unclear why this apparent data corruption in the
# Rocket.Chat database is possible, but empirically, some
# chunks don't have any associated metadata.
continue
upload_id_to_upload_data_map[chunk["files_id"]]["chunk"].append(chunk["data"])
return upload_id_to_upload_data_map
def separate_channel_private_and_livechat_messages(
messages: List[Dict[str, Any]],
dsc_id_to_dsc_map: Dict[str, Dict[str, Any]],
direct_id_to_direct_map: Dict[str, Dict[str, Any]],
huddle_id_to_huddle_map: Dict[str, Dict[str, Any]],
livechat_id_to_livechat_map: Dict[str, Dict[str, Any]],
channel_messages: List[Dict[str, Any]],
private_messages: List[Dict[str, Any]],
livechat_messages: List[Dict[str, Any]],
) -> None:
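    """Partition messages into channel, private (direct or huddle), and
    livechat messages. Messages without an `rid` are dropped, and
    discussion messages whose parent is a direct channel are reassigned
    to that parent channel."""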
private_channels_list = [*direct_id_to_direct_map, *huddle_id_to_huddle_map]
for message in messages:
if not message.get("rid"):
# Message does not belong to any channel (might be
# related to livechat), so ignore all such messages.
continue
if message["rid"] in dsc_id_to_dsc_map:
parent_channel_id = dsc_id_to_dsc_map[message["rid"]]["prid"]
if parent_channel_id in private_channels_list:
# Messages in discussions originating from direct channels
# are treated as if they were posted in the parent direct
# channel only.
message["rid"] = parent_channel_id
if message["rid"] in private_channels_list:
private_messages.append(message)
elif message["rid"] in livechat_id_to_livechat_map:
livechat_messages.append(message)
else:
channel_messages.append(message)
def map_receiver_id_to_recipient_id(
zerver_recipient: List[ZerverFieldsT],
stream_id_to_recipient_id: Dict[int, int],
huddle_id_to_recipient_id: Dict[int, int],
user_id_to_recipient_id: Dict[int, int],
) -> None:
# receiver_id represents stream_id/huddle_id/user_id
for recipient in zerver_recipient:
if recipient["type"] == Recipient.STREAM:
stream_id_to_recipient_id[recipient["type_id"]] = recipient["id"]
elif recipient["type"] == Recipient.HUDDLE:
huddle_id_to_recipient_id[recipient["type_id"]] = recipient["id"]
elif recipient["type"] == Recipient.PERSONAL:
user_id_to_recipient_id[recipient["type_id"]] = recipient["id"]
# This is inspired by get_huddle_hash from zerver/models/recipients.py. It
# expects strings identifying Rocket.Chat users, like
# `LdBZ7kPxtKESyHPEe`, not integer IDs.
#
# Its purpose is to provide a stable hash usable for deduplicating/merging
# Rocket.Chat huddles involving the same set of people. Thus, its
# only important property is that if two sets of users S and T are
# equal (and thus will have the same actual huddle hash once imported),
# then get_string_huddle_hash(S) == get_string_huddle_hash(T).
def get_string_huddle_hash(id_list: List[str]) -> str:
id_list = sorted(set(id_list))
hash_key = ",".join(str(x) for x in id_list)
return hashlib.sha1(hash_key.encode()).hexdigest()
def categorize_channels_and_map_with_id(
channel_data: List[Dict[str, Any]],
room_id_to_room_map: Dict[str, Dict[str, Any]],
team_id_to_team_map: Dict[str, Dict[str, Any]],
dsc_id_to_dsc_map: Dict[str, Dict[str, Any]],
direct_id_to_direct_map: Dict[str, Dict[str, Any]],
huddle_id_to_huddle_map: Dict[str, Dict[str, Any]],
livechat_id_to_livechat_map: Dict[str, Dict[str, Any]],
) -> None:
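    """Sort Rocket.Chat rooms into discussions, direct channels, huddles,
    livechat channels, teams, and regular channels, populating the
    corresponding `*_map` dictionaries keyed by room `_id`; duplicate
    huddles with no message history are dropped."""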
huddle_hashed_channels: Dict[str, Any] = {}
for channel in channel_data:
if channel.get("prid"):
dsc_id_to_dsc_map[channel["_id"]] = channel
elif channel["t"] == "d":
if len(channel["uids"]) > 2:
huddle_hash = get_string_huddle_hash(channel["uids"])
logging.info(
"Huddle channel found. UIDs: %s -> hash %s", channel["uids"], huddle_hash
)
if channel["msgs"] == 0: # nocoverage
# Rocket.Chat exports in the wild sometimes
# contain duplicates of real huddles, with no
# messages in the duplicate. We ignore these
# minor database corruptions in the Rocket.Chat
# export. Doing so is safe, because a huddle with no
# message history has no value in Zulip's data
# model.
logging.debug("Skipping huddle with 0 messages: %s", channel)
elif huddle_hash in huddle_hashed_channels: # nocoverage
logging.info(
"Mapping huddle hash %s to existing channel: %s",
huddle_hash,
huddle_hashed_channels[huddle_hash],
)
huddle_id_to_huddle_map[channel["_id"]] = huddle_hashed_channels[huddle_hash]
# Ideally, we'd merge the duplicate huddles. Doing
# so correctly requires special handling in
# convert_huddle_data() and on the message import
# side as well, since those appear to be mapped
# via Rocket.Chat channel IDs and not all of that
# information is resolved via the
# huddle_id_to_huddle_map.
#
# For now, just throw an exception here rather
# than during the import process.
raise NotImplementedError(
"Mapping multiple huddles with messages to one is not fully implemented yet"
)
else:
huddle_id_to_huddle_map[channel["_id"]] = channel
huddle_hashed_channels[huddle_hash] = channel
else:
direct_id_to_direct_map[channel["_id"]] = channel
elif channel["t"] == "l":
livechat_id_to_livechat_map[channel["_id"]] = channel
else:
room_id_to_room_map[channel["_id"]] = channel
if channel.get("teamMain"):
team_id_to_team_map[channel["teamId"]] = channel
def map_username_to_user_id(user_id_to_user_map: Dict[str, Dict[str, Any]]) -> Dict[str, str]:
username_to_user_id_map: Dict[str, str] = {}
for user_id, user_dict in user_id_to_user_map.items():
username_to_user_id_map[user_dict["username"]] = user_id
return username_to_user_id_map
def map_user_id_to_user(user_data_list: List[Dict[str, Any]]) -> Dict[str, Dict[str, Any]]:
user_id_to_user_map = {}
for user in user_data_list:
user_id_to_user_map[user["_id"]] = user
return user_id_to_user_map
def rocketchat_data_to_dict(rocketchat_data_dir: str) -> Dict[str, Any]:
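    """Load the BSON collections from a MongoDB dump of the Rocket.Chat
    database (instance, users, rooms, messages, avatars, custom emoji,
    and uploads) into a single dictionary."""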
codec_options = bson.DEFAULT_CODEC_OPTIONS.with_options(tz_aware=True)
rocketchat_data: Dict[str, Any] = {}
rocketchat_data["instance"] = []
rocketchat_data["user"] = []
rocketchat_data["avatar"] = {"avatar": [], "file": [], "chunk": []}
rocketchat_data["room"] = []
rocketchat_data["message"] = []
rocketchat_data["custom_emoji"] = {"emoji": [], "file": [], "chunk": []}
rocketchat_data["upload"] = {"upload": [], "file": [], "chunk": []}
# Get instance
with open(os.path.join(rocketchat_data_dir, "instances.bson"), "rb") as fcache:
rocketchat_data["instance"] = bson.decode_all(fcache.read(), codec_options)
# Get user
with open(os.path.join(rocketchat_data_dir, "users.bson"), "rb") as fcache:
rocketchat_data["user"] = bson.decode_all(fcache.read(), codec_options)
# Get avatar
with open(os.path.join(rocketchat_data_dir, "rocketchat_avatars.bson"), "rb") as fcache:
rocketchat_data["avatar"]["avatar"] = bson.decode_all(fcache.read(), codec_options)
if rocketchat_data["avatar"]["avatar"]:
with open(
os.path.join(rocketchat_data_dir, "rocketchat_avatars.files.bson"), "rb"
) as fcache:
rocketchat_data["avatar"]["file"] = bson.decode_all(fcache.read(), codec_options)
with open(
os.path.join(rocketchat_data_dir, "rocketchat_avatars.chunks.bson"), "rb"
) as fcache:
rocketchat_data["avatar"]["chunk"] = bson.decode_all(fcache.read(), codec_options)
# Get room
with open(os.path.join(rocketchat_data_dir, "rocketchat_room.bson"), "rb") as fcache:
rocketchat_data["room"] = bson.decode_all(fcache.read(), codec_options)
# Get messages
with open(os.path.join(rocketchat_data_dir, "rocketchat_message.bson"), "rb") as fcache:
rocketchat_data["message"] = bson.decode_all(fcache.read(), codec_options)
# Get custom emoji
with open(os.path.join(rocketchat_data_dir, "rocketchat_custom_emoji.bson"), "rb") as fcache:
rocketchat_data["custom_emoji"]["emoji"] = bson.decode_all(fcache.read(), codec_options)
if rocketchat_data["custom_emoji"]["emoji"]:
with open(os.path.join(rocketchat_data_dir, "custom_emoji.files.bson"), "rb") as fcache:
rocketchat_data["custom_emoji"]["file"] = bson.decode_all(fcache.read(), codec_options)
with open(os.path.join(rocketchat_data_dir, "custom_emoji.chunks.bson"), "rb") as fcache:
rocketchat_data["custom_emoji"]["chunk"] = bson.decode_all(fcache.read(), codec_options)
# Get uploads
with open(os.path.join(rocketchat_data_dir, "rocketchat_uploads.bson"), "rb") as fcache:
rocketchat_data["upload"]["upload"] = bson.decode_all(fcache.read(), codec_options)
if rocketchat_data["upload"]["upload"]:
with open(
os.path.join(rocketchat_data_dir, "rocketchat_uploads.files.bson"), "rb"
) as fcache:
rocketchat_data["upload"]["file"] = bson.decode_all(fcache.read(), codec_options)
with open(
os.path.join(rocketchat_data_dir, "rocketchat_uploads.chunks.bson"), "rb"
) as fcache:
rocketchat_data["upload"]["chunk"] = bson.decode_all(fcache.read(), codec_options)
return rocketchat_data
def do_convert_data(rocketchat_data_dir: str, output_dir: str) -> None:
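    """Top-level entry point: convert a Rocket.Chat database export in
    `rocketchat_data_dir` into Zulip's data import format in
    `output_dir`. Livechat messages and avatars are not imported."""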
# Get all required exported data in a dictionary
rocketchat_data = rocketchat_data_to_dict(rocketchat_data_dir)
# Subdomain is set by the user while running the import command
realm_subdomain = ""
realm_id = 0
domain_name = settings.EXTERNAL_HOST
realm = make_realm(realm_id, realm_subdomain, domain_name, rocketchat_data["instance"][0])
user_id_to_user_map: Dict[str, Dict[str, Any]] = map_user_id_to_user(rocketchat_data["user"])
username_to_user_id_map: Dict[str, str] = map_username_to_user_id(user_id_to_user_map)
user_handler = UserHandler()
subscriber_handler = SubscriberHandler()
user_id_mapper = IdMapper()
stream_id_mapper = IdMapper()
huddle_id_mapper = IdMapper()
thread_id_mapper = IdMapper()
process_users(
user_id_to_user_map=user_id_to_user_map,
realm_id=realm_id,
domain_name=domain_name,
user_handler=user_handler,
user_id_mapper=user_id_mapper,
)
room_id_to_room_map: Dict[str, Dict[str, Any]] = {}
team_id_to_team_map: Dict[str, Dict[str, Any]] = {}
dsc_id_to_dsc_map: Dict[str, Dict[str, Any]] = {}
direct_id_to_direct_map: Dict[str, Dict[str, Any]] = {}
huddle_id_to_huddle_map: Dict[str, Dict[str, Any]] = {}
livechat_id_to_livechat_map: Dict[str, Dict[str, Any]] = {}
categorize_channels_and_map_with_id(
channel_data=rocketchat_data["room"],
room_id_to_room_map=room_id_to_room_map,
team_id_to_team_map=team_id_to_team_map,
dsc_id_to_dsc_map=dsc_id_to_dsc_map,
direct_id_to_direct_map=direct_id_to_direct_map,
huddle_id_to_huddle_map=huddle_id_to_huddle_map,
livechat_id_to_livechat_map=livechat_id_to_livechat_map,
)
zerver_stream = convert_channel_data(
room_id_to_room_map=room_id_to_room_map,
team_id_to_team_map=team_id_to_team_map,
stream_id_mapper=stream_id_mapper,
realm_id=realm_id,
)
realm["zerver_stream"] = zerver_stream
# Add stream subscription data to `subscriber_handler`
convert_stream_subscription_data(
user_id_to_user_map=user_id_to_user_map,
dsc_id_to_dsc_map=dsc_id_to_dsc_map,
zerver_stream=zerver_stream,
stream_id_mapper=stream_id_mapper,
user_id_mapper=user_id_mapper,
subscriber_handler=subscriber_handler,
)
zerver_huddle = convert_huddle_data(
huddle_id_to_huddle_map=huddle_id_to_huddle_map,
huddle_id_mapper=huddle_id_mapper,
user_id_mapper=user_id_mapper,
subscriber_handler=subscriber_handler,
)
realm["zerver_huddle"] = zerver_huddle
all_users = user_handler.get_all_users()
zerver_recipient = build_recipients(
zerver_userprofile=all_users,
zerver_stream=zerver_stream,
zerver_huddle=zerver_huddle,
)
realm["zerver_recipient"] = zerver_recipient
stream_subscriptions = build_stream_subscriptions(
get_users=subscriber_handler.get_users,
zerver_recipient=zerver_recipient,
zerver_stream=zerver_stream,
)
huddle_subscriptions = build_huddle_subscriptions(
get_users=subscriber_handler.get_users,
zerver_recipient=zerver_recipient,
zerver_huddle=zerver_huddle,
)
personal_subscriptions = build_personal_subscriptions(
zerver_recipient=zerver_recipient,
)
zerver_subscription = personal_subscriptions + stream_subscriptions + huddle_subscriptions
realm["zerver_subscription"] = zerver_subscription
zerver_realmemoji = build_custom_emoji(
realm_id=realm_id,
custom_emoji_data=rocketchat_data["custom_emoji"],
output_dir=output_dir,
)
realm["zerver_realmemoji"] = zerver_realmemoji
subscriber_map = make_subscriber_map(
zerver_subscription=zerver_subscription,
)
stream_id_to_recipient_id: Dict[int, int] = {}
huddle_id_to_recipient_id: Dict[int, int] = {}
user_id_to_recipient_id: Dict[int, int] = {}
map_receiver_id_to_recipient_id(
zerver_recipient=zerver_recipient,
stream_id_to_recipient_id=stream_id_to_recipient_id,
huddle_id_to_recipient_id=huddle_id_to_recipient_id,
user_id_to_recipient_id=user_id_to_recipient_id,
)
channel_messages: List[Dict[str, Any]] = []
private_messages: List[Dict[str, Any]] = []
livechat_messages: List[Dict[str, Any]] = []
separate_channel_private_and_livechat_messages(
messages=rocketchat_data["message"],
dsc_id_to_dsc_map=dsc_id_to_dsc_map,
direct_id_to_direct_map=direct_id_to_direct_map,
huddle_id_to_huddle_map=huddle_id_to_huddle_map,
livechat_id_to_livechat_map=livechat_id_to_livechat_map,
channel_messages=channel_messages,
private_messages=private_messages,
livechat_messages=livechat_messages,
)
total_reactions: List[ZerverFieldsT] = []
uploads_list: List[ZerverFieldsT] = []
zerver_attachment: List[ZerverFieldsT] = []
upload_id_to_upload_data_map = map_upload_id_to_upload_data(rocketchat_data["upload"])
# Process channel messages
process_messages(
realm_id=realm_id,
messages=channel_messages,
subscriber_map=subscriber_map,
is_pm_data=False,
username_to_user_id_map=username_to_user_id_map,
user_id_mapper=user_id_mapper,
user_handler=user_handler,
user_id_to_recipient_id=user_id_to_recipient_id,
stream_id_mapper=stream_id_mapper,
stream_id_to_recipient_id=stream_id_to_recipient_id,
huddle_id_mapper=huddle_id_mapper,
huddle_id_to_recipient_id=huddle_id_to_recipient_id,
thread_id_mapper=thread_id_mapper,
room_id_to_room_map=room_id_to_room_map,
dsc_id_to_dsc_map=dsc_id_to_dsc_map,
direct_id_to_direct_map=direct_id_to_direct_map,
huddle_id_to_huddle_map=huddle_id_to_huddle_map,
zerver_realmemoji=zerver_realmemoji,
total_reactions=total_reactions,
uploads_list=uploads_list,
zerver_attachment=zerver_attachment,
upload_id_to_upload_data_map=upload_id_to_upload_data_map,
output_dir=output_dir,
)
# Process direct messages
process_messages(
realm_id=realm_id,
messages=private_messages,
subscriber_map=subscriber_map,
is_pm_data=True,
username_to_user_id_map=username_to_user_id_map,
user_id_mapper=user_id_mapper,
user_handler=user_handler,
user_id_to_recipient_id=user_id_to_recipient_id,
stream_id_mapper=stream_id_mapper,
stream_id_to_recipient_id=stream_id_to_recipient_id,
huddle_id_mapper=huddle_id_mapper,
huddle_id_to_recipient_id=huddle_id_to_recipient_id,
thread_id_mapper=thread_id_mapper,
room_id_to_room_map=room_id_to_room_map,
dsc_id_to_dsc_map=dsc_id_to_dsc_map,
direct_id_to_direct_map=direct_id_to_direct_map,
huddle_id_to_huddle_map=huddle_id_to_huddle_map,
zerver_realmemoji=zerver_realmemoji,
total_reactions=total_reactions,
uploads_list=uploads_list,
zerver_attachment=zerver_attachment,
upload_id_to_upload_data_map=upload_id_to_upload_data_map,
output_dir=output_dir,
)
realm["zerver_reaction"] = total_reactions
realm["zerver_userprofile"] = user_handler.get_all_users()
realm["sort_by_date"] = True
create_converted_data_files(realm, output_dir, "/realm.json")
# TODO: Add support for importing avatars
create_converted_data_files([], output_dir, "/avatars/records.json")
# Import attachments
attachment: Dict[str, List[Any]] = {"zerver_attachment": zerver_attachment}
create_converted_data_files(attachment, output_dir, "/attachment.json")
create_converted_data_files(uploads_list, output_dir, "/uploads/records.json")