data_import: Add support for importing attachments from Mattermost.

Add support for importing message attachments from Mattermost.

Fixes: #18959
This commit is contained in:
Priyansh Garg 2021-06-28 16:29:16 +05:30 committed by Tim Abbott
parent a35b9fd2d9
commit 5b2e21965c
6 changed files with 216 additions and 21 deletions

View File

@ -36,10 +36,10 @@ Replace `<username>` and `<server_ip>` with the appropriate values below.
3. Create an export of all your Mattermost teams, as a tar file.
```
sudo ./mattermost export bulk export.json --all-teams
sudo ./mattermost export bulk export.json --all-teams --attachments
mkdir -p exported_emoji
tar --transform 's|^|mattermost/|' -czf export.tar.gz \
exported_emoji/ export.json
data/ exported_emoji/ export.json
```
4. Exit your shell on the Mattermost server.
@ -73,11 +73,11 @@ Replace `<username>` and `<server_ip>` with the appropriate values below.
```
docker exec -it mattermost-docker_app_1 mattermost \
export bulk data/export.json --all-teams
export bulk data/export.json --all-teams --attachments
cd volumes/app/mattermost/data/
mkdir -p exported_emoji
tar --transform 's|^|mattermost/|' -czf export.tar.gz \
exported_emoji/ export.json
data/ exported_emoji/ export.json
```
4. Exit your shell on the Mattermost server.
@ -103,10 +103,10 @@ Replace `<username>` and `<server_ip>` with the appropriate values below.
sudo -u \
mattermost /opt/gitlab/embedded/bin/mattermost \
--config=/var/opt/gitlab/mattermost/config.json \
export bulk export.json --all-teams
export bulk export.json --all-teams --attachments
mkdir -p exported_emoji
tar --transform 's|^|mattermost/|' -czf export.tar.gz \
exported_emoji/ export.json
data/ exported_emoji/ export.json
```
3. Exit your shell on the GitLab Omnibus server.
@ -182,7 +182,6 @@ Mattermost's export tool is incomplete and does not support exporting
the following data:
* user avatars
* uploaded files and message attachments.
We expect to add support for importing these data from Mattermost once
Mattermost's export tool includes them.

View File

@ -4,10 +4,12 @@ https://docs.mattermost.com/administration/bulk-export.html
"""
import logging
import os
import random
import re
import secrets
import shutil
import subprocess
from typing import Any, Callable, Dict, List, Set
from typing import Any, Callable, Dict, List, Set, Tuple
import orjson
from django.conf import settings
@ -17,6 +19,7 @@ from django.utils.timezone import now as timezone_now
from zerver.data_import.import_util import (
SubscriberHandler,
ZerverFieldsT,
build_attachment,
build_huddle,
build_huddle_subscriptions,
build_message,
@ -35,6 +38,7 @@ from zerver.data_import.import_util import (
from zerver.data_import.mattermost_user import UserHandler
from zerver.data_import.sequencer import NEXT_ID, IdMapper
from zerver.lib.emoji import name_to_codepoint
from zerver.lib.upload import sanitize_name
from zerver.lib.utils import process_list_in_batches
from zerver.models import Reaction, RealmEmoji, Recipient, UserProfile
@ -311,6 +315,82 @@ def get_mentioned_user_ids(raw_message: Dict[str, Any], user_id_mapper: IdMapper
return user_ids
def process_message_attachments(
    attachments: List[Dict[str, Any]],
    realm_id: int,
    message_id: int,
    user_id: int,
    user_handler: UserHandler,
    zerver_attachment: List[ZerverFieldsT],
    uploads_list: List[ZerverFieldsT],
    mattermost_data_dir: str,
    output_dir: str,
) -> Tuple[str, bool]:
    """Convert the attachments of one Mattermost message into upload data.

    For every entry in `attachments`, the file located at
    `<mattermost_data_dir>/data/<attachment["path"]>` is copied to
    `<output_dir>/uploads/<s3_path>`, an upload record is appended to
    `uploads_list`, and `build_attachment` is called with
    `zerver_attachment` so the attachment row is recorded for
    `message_id`.

    Returns a `(content, has_image)` tuple: `content` is one Markdown
    link per attachment, joined by newlines (an empty string when
    `attachments` is empty), and `has_image` is True when at least one
    attachment has a recognized image file extension.

    Raises `OSError` if an attachment file cannot be read from
    `mattermost_data_dir`.
    """
    # These file extensions are taken from `markdown.js`
    # variable `backend_only_markdown_re`.
    image_extensions = ("bmp", "gif", "jpg", "jpeg", "png", "webp")

    has_image = False
    markdown_links = []

    for attachment in attachments:
        attachment_path = attachment["path"]
        attachment_full_path = os.path.join(mattermost_data_dir, "data", attachment_path)

        file_name = attachment_path.split("/")[-1]

        # Only treat the text after the last "." as an extension when the
        # name really contains a "."; otherwise an extensionless file whose
        # whole name is e.g. "png" would be misreported as an image.
        _, dot, file_ext = file_name.rpartition(".")
        if dot and file_ext.lower() in image_extensions:
            has_image = True

        # Path layout: <realm id>/<random hex byte>/<random token>/<sanitized name>.
        s3_path = "/".join(
            [
                str(realm_id),
                format(random.randint(0, 255), "x"),
                secrets.token_urlsafe(18),
                sanitize_name(file_name),
            ]
        )
        content_for_link = f"[{file_name}](/user_uploads/{s3_path})"
        markdown_links.append(content_for_link)

        # The source file's modification time is used as the creation time.
        fileinfo = {
            "name": file_name,
            "size": os.path.getsize(attachment_full_path),
            "created": os.path.getmtime(attachment_full_path),
        }

        upload = dict(
            path=s3_path,
            realm_id=realm_id,
            content_type=None,
            user_profile_id=user_id,
            last_modified=fileinfo["created"],
            user_profile_email=user_handler.get_user(user_id=user_id)["email"],
            s3_path=s3_path,
            size=fileinfo["size"],
        )
        uploads_list.append(upload)

        build_attachment(
            realm_id=realm_id,
            message_ids={message_id},
            user_id=user_id,
            fileinfo=fileinfo,
            s3_path=s3_path,
            zerver_attachment=zerver_attachment,
        )

        # Copy the attachment file to output_dir
        attachment_out_path = os.path.join(output_dir, "uploads", s3_path)
        os.makedirs(os.path.dirname(attachment_out_path), exist_ok=True)
        shutil.copyfile(attachment_full_path, attachment_out_path)

    content = "\n".join(markdown_links)

    return content, has_image
def process_raw_message_batch(
realm_id: int,
raw_messages: List[Dict[str, Any]],
@ -322,6 +402,9 @@ def process_raw_message_batch(
output_dir: str,
zerver_realmemoji: List[Dict[str, Any]],
total_reactions: List[Dict[str, Any]],
uploads_list: List[ZerverFieldsT],
zerver_attachment: List[ZerverFieldsT],
mattermost_data_dir: str,
) -> None:
def fix_mentions(content: str, mention_user_ids: Set[int]) -> str:
for user_id in mention_user_ids:
@ -384,6 +467,27 @@ def process_raw_message_batch(
rendered_content = None
has_attachment = False
has_image = False
has_link = False
if "attachments" in raw_message:
has_attachment = True
has_link = True
attachment_markdown, has_image = process_message_attachments(
attachments=raw_message["attachments"],
realm_id=realm_id,
message_id=message_id,
user_id=sender_user_id,
user_handler=user_handler,
zerver_attachment=zerver_attachment,
uploads_list=uploads_list,
mattermost_data_dir=mattermost_data_dir,
output_dir=output_dir,
)
content += attachment_markdown
topic_name = "imported from mattermost"
message = build_message(
@ -394,7 +498,9 @@ def process_raw_message_batch(
rendered_content=rendered_content,
topic_name=topic_name,
user_id=sender_user_id,
has_attachment=False,
has_image=has_image,
has_link=has_link,
has_attachment=has_attachment,
)
zerver_message.append(message)
build_reactions(
@ -435,9 +541,11 @@ def process_posts(
masking_content: bool,
user_id_mapper: IdMapper,
user_handler: UserHandler,
username_to_user: Dict[str, Dict[str, Any]],
zerver_realmemoji: List[Dict[str, Any]],
total_reactions: List[Dict[str, Any]],
uploads_list: List[ZerverFieldsT],
zerver_attachment: List[ZerverFieldsT],
mattermost_data_dir: str,
) -> None:
post_data_list = []
@ -486,6 +594,10 @@ def process_posts(
message_dict["pm_members"] = channel_members
else:
raise AssertionError("Post without channel or channel_members key.")
if post_dict.get("attachments"):
message_dict["attachments"] = post_dict["attachments"]
return message_dict
raw_messages = []
@ -514,6 +626,9 @@ def process_posts(
output_dir=output_dir,
zerver_realmemoji=zerver_realmemoji,
total_reactions=total_reactions,
uploads_list=uploads_list,
zerver_attachment=zerver_attachment,
mattermost_data_dir=mattermost_data_dir,
)
chunk_size = 1000
@ -538,9 +653,11 @@ def write_message_data(
huddle_id_mapper: IdMapper,
user_id_mapper: IdMapper,
user_handler: UserHandler,
username_to_user: Dict[str, Dict[str, Any]],
zerver_realmemoji: List[Dict[str, Any]],
total_reactions: List[Dict[str, Any]],
uploads_list: List[ZerverFieldsT],
zerver_attachment: List[ZerverFieldsT],
mattermost_data_dir: str,
) -> None:
stream_id_to_recipient_id = {}
huddle_id_to_recipient_id = {}
@ -589,9 +706,11 @@ def write_message_data(
masking_content=masking_content,
user_id_mapper=user_id_mapper,
user_handler=user_handler,
username_to_user=username_to_user,
zerver_realmemoji=zerver_realmemoji,
total_reactions=total_reactions,
uploads_list=uploads_list,
zerver_attachment=zerver_attachment,
mattermost_data_dir=mattermost_data_dir,
)
@ -857,6 +976,9 @@ def do_convert_data(mattermost_data_dir: str, output_dir: str, masking_content:
)
total_reactions: List[Dict[str, Any]] = []
uploads_list: List[ZerverFieldsT] = []
zerver_attachment: List[ZerverFieldsT] = []
write_message_data(
num_teams=len(mattermost_data["team"]),
team_name=team_name,
@ -870,9 +992,11 @@ def do_convert_data(mattermost_data_dir: str, output_dir: str, masking_content:
huddle_id_mapper=huddle_id_mapper,
user_id_mapper=user_id_mapper,
user_handler=user_handler,
username_to_user=username_to_user,
zerver_realmemoji=zerver_realmemoji,
total_reactions=total_reactions,
uploads_list=uploads_list,
zerver_attachment=zerver_attachment,
mattermost_data_dir=mattermost_data_dir,
)
realm["zerver_reaction"] = total_reactions
realm["zerver_userprofile"] = user_handler.get_all_users()
@ -881,11 +1005,10 @@ def do_convert_data(mattermost_data_dir: str, output_dir: str, masking_content:
create_converted_data_files(realm, realm_output_dir, "/realm.json")
# Mattermost currently doesn't support exporting avatars
create_converted_data_files([], realm_output_dir, "/avatars/records.json")
# Mattermost currently doesn't support exporting uploads
create_converted_data_files([], realm_output_dir, "/uploads/records.json")
# Mattermost currently doesn't support exporting attachments
attachment: Dict[str, List[Any]] = {"zerver_attachment": []}
# Export message attachments
attachment: Dict[str, List[Any]] = {"zerver_attachment": zerver_attachment}
create_converted_data_files(uploads_list, realm_output_dir, "/uploads/records.json")
create_converted_data_files(attachment, realm_output_dir, "/attachment.json")
logging.info("Start making tarball")

View File

@ -10,10 +10,10 @@
{"type":"post","post":{"team":"gryffindor","channel":"gryffindor-common-room","user":"ron","message":"ron joined the channel.","create_at":1553166512493,"reactions":null,"replies":null}}
{"type":"post","post":{"team":"gryffindor","channel":"gryffindor-quidditch-team","user":"ron","message":"Hey folks","create_at":1553166519720,"reactions":null,"replies":null}}
{"type":"post","post":{"team":"gryffindor","channel":"gryffindor-quidditch-team","user":"harry","message":"@ron Welcome mate!","create_at":1553166519726,"reactions":null,"replies":null}}
{"type":"post","post":{"team":"gryffindor","channel":"gryffindor-common-room","user":"harry","message":"Looks like this channel is empty","create_at":1553166567370,"reactions":[{"user":"ron","create_at":1553166584976,"emoji_name":"rocket"}],"replies":null}}
{"type":"post","post":{"team":"gryffindor","channel":"gryffindor-common-room","user":"harry","message":"Looks like this channel is empty","create_at":1553166567370,"reactions":[{"user":"ron","create_at":1553166584976,"emoji_name":"rocket"}],"replies":null,"attachments":[{"path":"20210622/teams/noteam/channels/mcrm7xee5bnpzn7u9ktsd91dwy/users/knq189b88fdxbdkeeasdynia4o/smaa5epsnp89tgjszzue1691ao/this is a file"}]}}
{"type":"direct_channel","direct_channel":{"members":["ron","harry"],"favorited_by":null,"header":""}}
{"type":"direct_channel","direct_channel":{"members":["ron","harry", "ginny"],"favorited_by":null,"header":""}}
{"type":"direct_post","direct_post":{"channel_members":["ron","harry"],"user":"ron","message":"hey harry","create_at":1566376137676,"flagged_by":null,"reactions":null,"replies":null,"attachments":null}}
{"type":"direct_post","direct_post":{"channel_members":["ron","harry"],"user":"ron","message":"hey harry","create_at":1566376137676,"flagged_by":null,"reactions":null,"replies":null,"attachments":[{"path":"20210622/teams/noteam/channels/mcrm7xee5bnpzn7u9ktsd91dwy/users/knq189b88fdxbdkeeasdynia4o/o3to4ezua3bajj31mzpkn96n5e/harry-ron.jpg"}]}}
{"type":"direct_post","direct_post":{"channel_members":["ron","harry"],"user":"harry","message":"what's up","create_at":1566376318568,"flagged_by":null,"reactions":null,"replies":null,"attachments":null}}
{"type":"direct_post","direct_post":{"channel_members":["ron","harry","ginny"],"user":"ginny","message":"Who is going to Hogsmeade this weekend?","create_at":1566376226493,"flagged_by":null,"reactions":null,"replies":null,"attachments":null}}
{"type":"direct_post","direct_post":{"channel_members":["ron","harry","ginny"],"user":"harry","message":"I am going.","create_at":1566376311350,"flagged_by":null,"reactions":null,"replies":null,"attachments":null}}

View File

@ -5,7 +5,7 @@ from unittest.mock import call, patch
import orjson
from zerver.data_import.import_util import SubscriberHandler
from zerver.data_import.import_util import SubscriberHandler, ZerverFieldsT
from zerver.data_import.mattermost import (
build_reactions,
check_user_in_team,
@ -18,6 +18,7 @@ from zerver.data_import.mattermost import (
get_mentioned_user_ids,
label_mirror_dummy_users,
mattermost_data_file_to_dict,
process_message_attachments,
process_user,
reset_mirror_dummy_users,
write_emoticon_data,
@ -382,6 +383,64 @@ class MatterMostImporter(ZulipTestCase):
)
self.assertTrue(filecmp.cmp(records_json[1]["path"], exported_emoji_path))
def test_process_message_attachments(self) -> None:
    """Convert the first direct post's attachment and verify the results.

    Checks the attachment record, the upload record, and that the file
    was copied byte-for-byte into the import output directory.
    """
    mattermost_data_dir = self.fixture_file_name("", "mattermost_fixtures/direct_channel")
    output_dir = self.make_import_output_dir("mattermost")

    export_json_path = self.fixture_file_name(
        "export.json", "mattermost_fixtures/direct_channel"
    )
    mattermost_data = mattermost_data_file_to_dict(export_json_path)

    # Convert the fixture's users first so the handler can resolve the
    # uploader's email address.
    username_to_user = create_username_to_user_mapping(mattermost_data["user"])
    reset_mirror_dummy_users(username_to_user)
    user_handler = UserHandler()
    convert_user_data(
        user_handler=user_handler,
        user_id_mapper=IdMapper(),
        user_data_map=username_to_user,
        realm_id=3,
        team_name="gryffindor",
    )

    raw_attachments = mattermost_data["post"]["direct_post"][0]["attachments"]
    converted_attachments: List[ZerverFieldsT] = []
    upload_records: List[ZerverFieldsT] = []

    process_message_attachments(
        attachments=raw_attachments,
        realm_id=3,
        message_id=1,
        user_id=2,
        user_handler=user_handler,
        zerver_attachment=converted_attachments,
        uploads_list=upload_records,
        mattermost_data_dir=mattermost_data_dir,
        output_dir=output_dir,
    )

    # Exactly one attachment row, owned by the sender (user id 2).
    self.assert_length(converted_attachments, 1)
    attachment_row = converted_attachments[0]
    self.assertEqual(attachment_row["file_name"], "harry-ron.jpg")
    self.assertEqual(attachment_row["owner"], 2)
    self.assertEqual(
        user_handler.get_user(attachment_row["owner"])["email"], "ron@zulip.com"
    )
    # TODO: Assert False here once file permissions for attachments in PMs are fixed.
    self.assertTrue(attachment_row["is_realm_public"])

    # Exactly one upload record, attributed to the same user.
    self.assert_length(upload_records, 1)
    self.assertEqual(upload_records[0]["user_profile_email"], "ron@zulip.com")

    # The copied file must exist and match the fixture file.
    source_path = self.fixture_file_name(
        raw_attachments[0]["path"],
        "mattermost_fixtures/direct_channel/data",
    )
    copied_path = os.path.join(output_dir, "uploads", attachment_row["path_id"])
    self.assertTrue(os.path.exists(copied_path))
    self.assertTrue(filecmp.cmp(source_path, copied_path))
def test_get_mentioned_user_ids(self) -> None:
user_id_mapper = IdMapper()
harry_id = user_id_mapper.get("harry")
@ -680,6 +739,7 @@ class MatterMostImporter(ZulipTestCase):
harry_team_output_dir = self.team_output_dir(output_dir, "gryffindor")
self.assertEqual(os.path.exists(os.path.join(harry_team_output_dir, "avatars")), True)
self.assertEqual(os.path.exists(os.path.join(harry_team_output_dir, "emoji")), True)
self.assertEqual(os.path.exists(os.path.join(harry_team_output_dir, "uploads")), True)
self.assertEqual(
os.path.exists(os.path.join(harry_team_output_dir, "attachment.json")), True
)
@ -763,6 +823,15 @@ class MatterMostImporter(ZulipTestCase):
self.assertEqual(stream_messages[0].sender.email, "ron@zulip.com")
self.assertEqual(stream_messages[0].content, "ron joined the channel.\n\n")
self.assertEqual(stream_messages[3].sender.email, "harry@zulip.com")
self.assertRegex(
stream_messages[3].content,
"Looks like this channel is empty\n\n\\[this is a file\\]\\(.*\\)",
)
self.assertTrue(stream_messages[3].has_attachment)
self.assertFalse(stream_messages[3].has_image)
self.assertTrue(stream_messages[3].has_link)
huddle_messages = messages.filter(recipient__type=Recipient.HUDDLE).order_by("date_sent")
huddle_recipients = huddle_messages.values_list("recipient", flat=True)
self.assert_length(huddle_messages, 3)
@ -777,7 +846,10 @@ class MatterMostImporter(ZulipTestCase):
self.assert_length(personal_messages, 4)
self.assert_length(set(personal_recipients), 3)
self.assertEqual(personal_messages[0].sender.email, "ron@zulip.com")
self.assertEqual(personal_messages[0].content, "hey harry\n\n")
self.assertRegex(personal_messages[0].content, "hey harry\n\n\\[harry-ron.jpg\\]\\(.*\\)")
self.assertTrue(personal_messages[0].has_attachment)
self.assertTrue(personal_messages[0].has_image)
self.assertTrue(personal_messages[0].has_link)
def test_do_convert_data_with_masking(self) -> None:
mattermost_data_dir = self.fixture_file_name("", "mattermost_fixtures")