import base64
import glob
import logging
import os
import re
import shutil
import subprocess
from typing import Any, Callable, Dict, List, Optional, Set

# Import the submodule explicitly; `import dateutil` alone does not
# guarantee that `dateutil.parser` has been loaded.
import dateutil.parser
import hypchat
import orjson
from django.conf import settings
from django.utils.timezone import now as timezone_now

from zerver.data_import.hipchat_attachment import AttachmentHandler
from zerver.data_import.hipchat_user import UserHandler
from zerver.data_import.import_util import (
    SubscriberHandler,
    build_message,
    build_personal_subscriptions,
    build_public_stream_subscriptions,
    build_realm,
    build_realm_emoji,
    build_recipients,
    build_stream,
    build_stream_subscriptions,
    build_user_profile,
    build_zerver_realm,
    create_converted_data_files,
    make_subscriber_map,
    make_user_messages,
    write_avatar_png,
)
from zerver.data_import.sequencer import NEXT_ID, IdMapper
from zerver.lib.utils import process_list_in_batches
from zerver.models import RealmEmoji, Recipient, UserProfile

# stubs
ZerverFieldsT = Dict[str, Any]

def str_date_to_float(date_str: str) -> float:
    '''
    Dates look like this:

    "2018-08-08T14:23:54Z 626267"
    '''

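    # Worked example (illustrative): for "2018-08-08T14:23:54Z 626267",
    # the first token parses to the Unix timestamp for 2018-08-08 14:23:54 UTC,
    # and the second token adds 0.626267 seconds of sub-second precision.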
    parts = date_str.split(' ')
    time_str = parts[0].replace('T', ' ')
    date_time = dateutil.parser.parse(time_str)
    timestamp = date_time.timestamp()
    if len(parts) == 2:
        microseconds = int(parts[1])
        timestamp += microseconds / 1000000.0
    return timestamp

def untar_input_file(tar_file: str) -> str:
    data_dir = tar_file.replace('.tar', '')
    data_dir = os.path.abspath(data_dir)

    if os.path.exists(data_dir):
        logging.info('input data was already untarred to %s, we will use it', data_dir)
        return data_dir

    os.makedirs(data_dir)

    subprocess.check_call(['tar', '-xf', tar_file, '-C', data_dir])

    logging.info('input data was untarred to %s', data_dir)

    return data_dir

def read_user_data(data_dir: str) -> List[ZerverFieldsT]:
    fn = 'users.json'
    data_file = os.path.join(data_dir, fn)
    with open(data_file, "rb") as fp:
        return orjson.loads(fp.read())

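# A raw record in users.json looks roughly like the following; the field
# names are the ones read below, and the values are purely illustrative:
#
#   {
#       "User": {
#           "id": 10,
#           "email": "alice@example.com",
#           "name": "Alice Smith",
#           "mention_name": "alice",
#           "timezone": "UTC",
#           "account_type": "user",
#           "is_deleted": false,
#           "avatar": "<base64-encoded image data>"
#       }
#   }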
def convert_user_data(user_handler: UserHandler,
                      slim_mode: bool,
                      user_id_mapper: IdMapper,
                      raw_data: List[ZerverFieldsT],
                      realm_id: int) -> None:
    flat_data = [
        d['User']
        for d in raw_data
    ]

    def process(in_dict: ZerverFieldsT) -> ZerverFieldsT:
        delivery_email = in_dict['email']
        email = in_dict['email']
        full_name = in_dict['name']
        id = user_id_mapper.get(in_dict['id'])
        is_mirror_dummy = False
        short_name = in_dict['mention_name']
        timezone = in_dict['timezone']

        role = UserProfile.ROLE_MEMBER
        if in_dict['account_type'] == 'admin':
            role = UserProfile.ROLE_REALM_ADMINISTRATOR
        if in_dict['account_type'] == 'guest':
            role = UserProfile.ROLE_GUEST

        date_joined = int(timezone_now().timestamp())
        is_active = not in_dict['is_deleted']

        if not email:
            if role == UserProfile.ROLE_GUEST:
                # Hipchat guest users don't have emails, so
                # we just fake them.
                email = f'guest-{id}@example.com'
                delivery_email = email
            else:
                # Hipchat sometimes doesn't export an email for deactivated users.
                assert not is_active
                email = delivery_email = f"deactivated-{id}@example.com"

        # unmapped fields:
        #    title - Developer, Project Manager, etc.
        #    rooms - no good sample data
        #    created - we just use "now"
        #    roles - we just use account_type

        if in_dict.get('avatar'):
            avatar_source = 'U'
        else:
            avatar_source = 'G'

        return build_user_profile(
            avatar_source=avatar_source,
            date_joined=date_joined,
            delivery_email=delivery_email,
            email=email,
            full_name=full_name,
            id=id,
            is_active=is_active,
            role=role,
            is_mirror_dummy=is_mirror_dummy,
            realm_id=realm_id,
            short_name=short_name,
            timezone=timezone,
        )

    for raw_item in flat_data:
        user = process(raw_item)
        user_handler.add_user(user)

def convert_avatar_data(avatar_folder: str,
                        raw_data: List[ZerverFieldsT],
                        user_id_mapper: IdMapper,
                        realm_id: int) -> List[ZerverFieldsT]:
    '''
    This code is pretty specific to how Hipchat sends us data.
    They give us the avatar payloads in base64 in users.json.

    We process avatars in our own pass of that data, rather
    than doing it while we're getting other user data.  I
    chose to keep this separate, as otherwise you have a lot
    of extraneous data getting passed around.

    This code has MAJOR SIDE EFFECTS--namely writing a bunch
    of files to the avatars directory.
    '''

    avatar_records = []

    for d in raw_data:
        raw_user = d['User']
        avatar_payload = raw_user.get('avatar')
        if not avatar_payload:
            continue

        bits = base64.b64decode(avatar_payload)

        raw_user_id = raw_user['id']
        if not user_id_mapper.has(raw_user_id):
            continue

        user_id = user_id_mapper.get(raw_user_id)

        metadata = write_avatar_png(
            avatar_folder=avatar_folder,
            realm_id=realm_id,
            user_id=user_id,
            bits=bits,
        )
        avatar_records.append(metadata)

    return avatar_records

def read_room_data(data_dir: str) -> List[ZerverFieldsT]:
    fn = 'rooms.json'
    data_file = os.path.join(data_dir, fn)
    with open(data_file, "rb") as f:
        data = orjson.loads(f.read())
    return data

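# A raw record in rooms.json looks roughly like the following; the field
# names are the ones read below, and the values are purely illustrative:
#
#   {
#       "Room": {
#           "id": 202,
#           "name": "engineering",
#           "topic": "Build discussion",
#           "privacy": "private",
#           "is_archived": false,
#           "members": [10, 11],
#           "owner": 10
#       }
#   }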
def convert_room_data(raw_data: List[ZerverFieldsT],
                      subscriber_handler: SubscriberHandler,
                      stream_id_mapper: IdMapper,
                      user_id_mapper: IdMapper,
                      realm_id: int,
                      api_token: Optional[str]=None) -> List[ZerverFieldsT]:
    flat_data = [
        d['Room']
        for d in raw_data
    ]

    def get_invite_only(v: str) -> bool:
        if v == 'public':
            return False
        elif v == 'private':
            return True
        else:
            raise Exception('unexpected value')

    streams = []

    for in_dict in flat_data:
        now = int(timezone_now().timestamp())
        stream_id = stream_id_mapper.get(in_dict['id'])

        invite_only = get_invite_only(in_dict['privacy'])

        stream = build_stream(
            date_created=now,
            realm_id=realm_id,
            name=in_dict['name'],
            description=in_dict['topic'],
            stream_id=stream_id,
            deactivated=in_dict['is_archived'],
            invite_only=invite_only,
        )

        if invite_only:
            users: Set[int] = {
                user_id_mapper.get(key)
                for key in in_dict['members']
                if user_id_mapper.has(key)
            }

            if user_id_mapper.has(in_dict['owner']):
                owner = user_id_mapper.get(in_dict['owner'])
                users.add(owner)
        else:
            users = set()
            if api_token is not None:
                hc = hypchat.HypChat(api_token)
                room_data = hc.fromurl('{}/v2/room/{}/member'.format(hc.endpoint, in_dict['id']))

                for item in room_data['items']:
                    hipchat_user_id = item['id']
                    zulip_user_id = user_id_mapper.get(hipchat_user_id)
                    users.add(zulip_user_id)

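        # Note: without an api_token, `users` stays empty for public rooms;
        # in that case do_convert_data falls back to subscribing all normal
        # users to public streams via build_public_stream_subscriptions.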
        if users:
            subscriber_handler.set_info(
                stream_id=stream_id,
                users=users,
            )

        # unmapped fields:
        #    guest_access_url: no Zulip equivalent
        #    created: we just use "now"
        #    participants: no good sample data

        streams.append(stream)

    return streams

def make_realm(realm_id: int) -> ZerverFieldsT:
    NOW = float(timezone_now().timestamp())
    domain_name = settings.EXTERNAL_HOST
    realm_subdomain = ""
    zerver_realm = build_zerver_realm(realm_id, realm_subdomain, NOW, 'HipChat')
    realm = build_realm(zerver_realm, realm_id, domain_name)

    # We may override these later.
    realm['zerver_defaultstream'] = []

    return realm

def write_avatar_data(raw_user_data: List[ZerverFieldsT],
                      output_dir: str,
                      user_id_mapper: IdMapper,
                      realm_id: int) -> None:
    avatar_folder = os.path.join(output_dir, 'avatars')
    avatar_realm_folder = os.path.join(avatar_folder, str(realm_id))
    os.makedirs(avatar_realm_folder, exist_ok=True)

    avatar_records = convert_avatar_data(
        avatar_folder=avatar_folder,
        raw_data=raw_user_data,
        user_id_mapper=user_id_mapper,
        realm_id=realm_id,
    )

    create_converted_data_files(avatar_records, output_dir, '/avatars/records.json')

def write_emoticon_data(realm_id: int,
                        data_dir: str,
                        output_dir: str) -> List[ZerverFieldsT]:
    '''
    This function does most of the work for processing emoticons, the bulk
    of which is copying files.  We also write a json file with metadata.
    Finally, we return a list of RealmEmoji dicts to our caller.

    In our data_dir we have a pretty simple setup:

        emoticons.json - has very simple metadata on emojis:

            {
              "Emoticon": {
                "id": 9875487,
                "path": "emoticons/yasss.jpg",
                "shortcut": "yasss"
              }
            },
            {
              "Emoticon": {
                "id": 718017,
                "path": "emoticons/yayyyyy.gif",
                "shortcut": "yayyyyy"
              }
            }

        emoticons/ - contains a bunch of image files:

            slytherinsnake.gif
            spanishinquisition.jpg
            sparkle.png
            spiderman.gif
            stableparrot.gif
            stalkerparrot.gif
            supergirl.png
            superman.png

    We move all the relevant files to Zulip's more nested
    directory structure.
    '''

    logging.info('Starting to process emoticons')

    fn = 'emoticons.json'
    data_file = os.path.join(data_dir, fn)
    if not os.path.exists(data_file):
        logging.warning("HipChat export does not contain emoticons.json.")
        logging.warning("As a result, custom emoji cannot be imported.")
        return []

    with open(data_file, "rb") as f:
        data = orjson.loads(f.read())

    if isinstance(data, dict) and 'Emoticons' in data:
        # Handle the hc-migrate export format for emoticons.json.
        flat_data = [
            dict(
                path=d['path'],
                name=d['shortcut'],
            )
            for d in data['Emoticons']
        ]
    else:
        flat_data = [
            dict(
                path=d['Emoticon']['path'],
                name=d['Emoticon']['shortcut'],
            )
            for d in data
        ]

    emoji_folder = os.path.join(output_dir, 'emoji')
    os.makedirs(emoji_folder, exist_ok=True)

    def process(data: ZerverFieldsT) -> ZerverFieldsT:
        source_sub_path = data['path']
        source_fn = os.path.basename(source_sub_path)
        source_path = os.path.join(data_dir, source_sub_path)

        # Use our template from RealmEmoji
        # PATH_ID_TEMPLATE = "{realm_id}/emoji/images/{emoji_file_name}"
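        # So an export path like "emoticons/yasss.jpg" ends up (illustratively)
        # at "<output_dir>/emoji/<realm_id>/emoji/images/yasss.jpg".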
        target_fn = source_fn
        target_sub_path = RealmEmoji.PATH_ID_TEMPLATE.format(
            realm_id=realm_id,
            emoji_file_name=target_fn,
        )
        target_path = os.path.join(emoji_folder, target_sub_path)

        os.makedirs(os.path.dirname(target_path), exist_ok=True)

        source_path = os.path.abspath(source_path)
        target_path = os.path.abspath(target_path)

        shutil.copyfile(source_path, target_path)

        return dict(
            path=target_path,
            s3_path=target_path,
            file_name=target_fn,
            realm_id=realm_id,
            name=data['name'],
        )

    emoji_records = list(map(process, flat_data))
    create_converted_data_files(emoji_records, output_dir, '/emoji/records.json')

    realmemoji = [
        build_realm_emoji(
            realm_id=realm_id,
            name=rec['name'],
            id=NEXT_ID('realmemoji'),
            file_name=rec['file_name'],
        )
        for rec in emoji_records
    ]
    logging.info('Done processing emoticons')

    return realmemoji

def write_message_data(realm_id: int,
                       slim_mode: bool,
                       message_key: str,
                       zerver_recipient: List[ZerverFieldsT],
                       subscriber_map: Dict[int, Set[int]],
                       data_dir: str,
                       output_dir: str,
                       masking_content: bool,
                       stream_id_mapper: IdMapper,
                       user_id_mapper: IdMapper,
                       user_handler: UserHandler,
                       attachment_handler: AttachmentHandler) -> None:

    stream_id_to_recipient_id = {
        d['type_id']: d['id']
        for d in zerver_recipient
        if d['type'] == Recipient.STREAM
    }

    user_id_to_recipient_id = {
        d['type_id']: d['id']
        for d in zerver_recipient
        if d['type'] == Recipient.PERSONAL
    }

    def get_stream_recipient_id(raw_message: ZerverFieldsT) -> int:
        fn_id = raw_message['fn_id']
        stream_id = stream_id_mapper.get(fn_id)
        recipient_id = stream_id_to_recipient_id[stream_id]
        return recipient_id

    def get_pm_recipient_id(raw_message: ZerverFieldsT) -> int:
        raw_user_id = raw_message['receiver_id']
        assert raw_user_id
        user_id = user_id_mapper.get(raw_user_id)
        recipient_id = user_id_to_recipient_id[user_id]
        return recipient_id

    if message_key in ['UserMessage', 'NotificationMessage']:
        is_pm_data = False
        dir_glob = os.path.join(data_dir, 'rooms', '*', 'history.json')
        get_recipient_id = get_stream_recipient_id
        get_files_dir = lambda fn_id: os.path.join(data_dir, 'rooms', str(fn_id), 'files')

    elif message_key == 'PrivateUserMessage':
        is_pm_data = True
        dir_glob = os.path.join(data_dir, 'users', '*', 'history.json')
        get_recipient_id = get_pm_recipient_id
        get_files_dir = lambda fn_id: os.path.join(data_dir, 'users', 'files')

    else:
        raise Exception('programming error: invalid message_key: ' + message_key)

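    # The export lays history out as rooms/<room_id>/history.json and
    # users/<user_id>/history.json; the directory name becomes fn_id below.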
    history_files = glob.glob(dir_glob)
    for fn in history_files:
        dir = os.path.dirname(fn)
        fn_id = os.path.basename(dir)
        files_dir = get_files_dir(fn_id)

        process_message_file(
            realm_id=realm_id,
            slim_mode=slim_mode,
            fn=fn,
            fn_id=fn_id,
            files_dir=files_dir,
            get_recipient_id=get_recipient_id,
            message_key=message_key,
            subscriber_map=subscriber_map,
            data_dir=data_dir,
            output_dir=output_dir,
            is_pm_data=is_pm_data,
            masking_content=masking_content,
            user_id_mapper=user_id_mapper,
            user_handler=user_handler,
            attachment_handler=attachment_handler,
        )

def get_hipchat_sender_id(realm_id: int,
                          slim_mode: bool,
                          message_dict: Dict[str, Any],
                          user_id_mapper: IdMapper,
                          user_handler: UserHandler) -> Optional[int]:
    '''
    The HipChat export is inconsistent in how it renders
    senders, and sometimes we don't even get an id.
    '''
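    # Three awkward cases are handled below: the sender is just a name
    # string, the sender id is 0, or the sender id never appeared in
    # users.json.  In each case we fall back to a mirror user (or skip
    # the message entirely in slim_mode).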
    if isinstance(message_dict['sender'], str):
        if slim_mode:
            return None
        # Some Hipchat instances just give us a person's
        # name in the sender field for NotificationMessage.
        # We turn them into a mirror user.
        mirror_user = user_handler.get_mirror_user(
            realm_id=realm_id,
            name=message_dict['sender'],
        )
        sender_id = mirror_user['id']
        return sender_id

    raw_sender_id = message_dict['sender']['id']

    if raw_sender_id == 0:
        if slim_mode:
            return None
        mirror_user = user_handler.get_mirror_user(
            realm_id=realm_id,
            name=message_dict['sender']['name'],
        )
        sender_id = mirror_user['id']
        return sender_id

    if not user_id_mapper.has(raw_sender_id):
        if slim_mode:
            return None
        mirror_user = user_handler.get_mirror_user(
            realm_id=realm_id,
            name=message_dict['sender']['id'],
        )
        sender_id = mirror_user['id']
        return sender_id

    # HAPPY PATH: Hipchat just gave us an ordinary
    # sender_id.
    sender_id = user_id_mapper.get(raw_sender_id)
    return sender_id

def process_message_file(realm_id: int,
                         slim_mode: bool,
                         fn: str,
                         fn_id: str,
                         files_dir: str,
                         get_recipient_id: Callable[[ZerverFieldsT], int],
                         message_key: str,
                         subscriber_map: Dict[int, Set[int]],
                         data_dir: str,
                         output_dir: str,
                         is_pm_data: bool,
                         masking_content: bool,
                         user_id_mapper: IdMapper,
                         user_handler: UserHandler,
                         attachment_handler: AttachmentHandler) -> None:

    def get_raw_messages(fn: str) -> List[ZerverFieldsT]:
        with open(fn, "rb") as f:
            data = orjson.loads(f.read())

        flat_data = [
            d[message_key]
            for d in data
            if message_key in d
        ]

        def get_raw_message(d: Dict[str, Any]) -> Optional[ZerverFieldsT]:
            sender_id = get_hipchat_sender_id(
                realm_id=realm_id,
                slim_mode=slim_mode,
                message_dict=d,
                user_id_mapper=user_id_mapper,
                user_handler=user_handler,
            )

            if sender_id is None:
                return None

            if is_pm_data:
                # We need to compare with str() on both sides here.
                # In Stride, user IDs are strings, but in HipChat,
                # they are integers, and fn_id is always a string.
                if str(sender_id) != str(fn_id):
                    # PMs are in multiple places in the Hipchat export,
                    # and we only use the copy from the sender
                    return None

            content = d['message']

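            # With masking enabled, letters in the body are scrubbed below,
            # e.g. "Hello, World!" becomes "Xxxxx, Xxxxx!"; punctuation,
            # digits, and whitespace are preserved.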
            if masking_content:
                content = re.sub('[a-z]', 'x', content)
                content = re.sub('[A-Z]', 'X', content)

            return dict(
                fn_id=fn_id,
                sender_id=sender_id,
                receiver_id=d.get('receiver', {}).get('id'),
                content=content,
                mention_user_ids=d.get('mentions', []),
                date_sent=str_date_to_float(d['timestamp']),
                attachment=d.get('attachment'),
                files_dir=files_dir,
            )

        raw_messages = []

        for d in flat_data:
            raw_message = get_raw_message(d)
            if raw_message is not None:
                raw_messages.append(raw_message)

        return raw_messages

    raw_messages = get_raw_messages(fn)

    def process_batch(lst: List[Any]) -> None:
        process_raw_message_batch(
            realm_id=realm_id,
            raw_messages=lst,
            subscriber_map=subscriber_map,
            user_id_mapper=user_id_mapper,
            user_handler=user_handler,
            attachment_handler=attachment_handler,
            get_recipient_id=get_recipient_id,
            is_pm_data=is_pm_data,
            output_dir=output_dir,
        )

    chunk_size = 1000

    process_list_in_batches(
        lst=raw_messages,
        chunk_size=chunk_size,
        process_batch=process_batch,
    )

def process_raw_message_batch(realm_id: int,
                              raw_messages: List[Dict[str, Any]],
                              subscriber_map: Dict[int, Set[int]],
                              user_id_mapper: IdMapper,
                              user_handler: UserHandler,
                              attachment_handler: AttachmentHandler,
                              get_recipient_id: Callable[[ZerverFieldsT], int],
                              is_pm_data: bool,
                              output_dir: str) -> None:

    def fix_mentions(content: str,
                     mention_user_ids: Set[int]) -> str:
        for user_id in mention_user_ids:
            user = user_handler.get_user(user_id=user_id)
            hipchat_mention = '@{short_name}'.format(**user)
            zulip_mention = '@**{full_name}**'.format(**user)
            content = content.replace(hipchat_mention, zulip_mention)

        content = content.replace('@here', '@**all**')
        return content

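    # For example, for a (hypothetical) user with short_name "alice" and
    # full_name "Alice Smith", fix_mentions turns "@alice ping" into
    # "@**Alice Smith** ping", and "@here" always becomes "@**all**".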
    mention_map: Dict[int, Set[int]] = {}

    zerver_message = []

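    # Message bodies in the HipChat export can contain HTML-style markup;
    # html2text (imported lazily below) converts them to Markdown-style
    # plain text before we build Zulip messages.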
    import html2text
    h = html2text.HTML2Text()

    for raw_message in raw_messages:
        # One side effect here:

        message_id = NEXT_ID('message')
        mention_user_ids = {
            user_id_mapper.get(id)
            for id in set(raw_message['mention_user_ids'])
            if user_id_mapper.has(id)
        }
        mention_map[message_id] = mention_user_ids

        content = fix_mentions(
            content=raw_message['content'],
            mention_user_ids=mention_user_ids,
        )
        content = h.handle(content)

        if len(content) > 10000:
            logging.info('skipping too-long message of length %s', len(content))
            continue

        date_sent = raw_message['date_sent']

        try:
            recipient_id = get_recipient_id(raw_message)
        except KeyError:
            logging.debug("Could not find recipient_id for a message, skipping.")
            continue

        rendered_content = None

        if is_pm_data:
            topic_name = ''
        else:
            topic_name = 'imported from hipchat'
        user_id = raw_message['sender_id']

        # Another side effect:
        extra_content = attachment_handler.handle_message_data(
            realm_id=realm_id,
            message_id=message_id,
            sender_id=user_id,
            attachment=raw_message['attachment'],
            files_dir=raw_message['files_dir'],
        )

        if extra_content:
            has_attachment = True
            content += '\n' + extra_content
        else:
            has_attachment = False

        message = build_message(
            content=content,
            message_id=message_id,
            date_sent=date_sent,
            recipient_id=recipient_id,
            rendered_content=rendered_content,
            topic_name=topic_name,
            user_id=user_id,
            has_attachment=has_attachment,
        )
        zerver_message.append(message)

    zerver_usermessage = make_user_messages(
        zerver_message=zerver_message,
        subscriber_map=subscriber_map,
        is_pm_data=is_pm_data,
        mention_map=mention_map,
    )

    message_json = dict(
        zerver_message=zerver_message,
        zerver_usermessage=zerver_usermessage,
    )

    dump_file_id = NEXT_ID('dump_file_id')
    message_file = f"/messages-{dump_file_id:06}.json"
    create_converted_data_files(message_json, output_dir, message_file)

def do_convert_data(input_tar_file: str,
                    output_dir: str,
                    masking_content: bool,
                    api_token: Optional[str]=None,
                    slim_mode: bool=False) -> None:
    input_data_dir = untar_input_file(input_tar_file)

    attachment_handler = AttachmentHandler()
    user_handler = UserHandler()
    subscriber_handler = SubscriberHandler()
    user_id_mapper = IdMapper()
    stream_id_mapper = IdMapper()

    realm_id = 0
    realm = make_realm(realm_id=realm_id)

    # users.json -> UserProfile
    raw_user_data = read_user_data(data_dir=input_data_dir)
    convert_user_data(
        user_handler=user_handler,
        slim_mode=slim_mode,
        user_id_mapper=user_id_mapper,
        raw_data=raw_user_data,
        realm_id=realm_id,
    )
    normal_users = user_handler.get_normal_users()
    # Don't write zerver_userprofile here, because we
    # may add more users later.

    # rooms.json -> Stream
    raw_stream_data = read_room_data(data_dir=input_data_dir)
    zerver_stream = convert_room_data(
        raw_data=raw_stream_data,
        subscriber_handler=subscriber_handler,
        stream_id_mapper=stream_id_mapper,
        user_id_mapper=user_id_mapper,
        realm_id=realm_id,
        api_token=api_token,
    )
    realm['zerver_stream'] = zerver_stream

    zerver_recipient = build_recipients(
        zerver_userprofile=normal_users,
        zerver_stream=zerver_stream,
    )
    realm['zerver_recipient'] = zerver_recipient

    if api_token is None:
        if slim_mode:
            public_stream_subscriptions: List[ZerverFieldsT] = []
        else:
            public_stream_subscriptions = build_public_stream_subscriptions(
                zerver_userprofile=normal_users,
                zerver_recipient=zerver_recipient,
                zerver_stream=zerver_stream,
            )

        private_stream_subscriptions = build_stream_subscriptions(
            get_users=subscriber_handler.get_users,
            zerver_recipient=zerver_recipient,
            zerver_stream=[stream_dict for stream_dict in zerver_stream
                           if stream_dict['invite_only']],
        )
        stream_subscriptions = public_stream_subscriptions + private_stream_subscriptions
    else:
        stream_subscriptions = build_stream_subscriptions(
            get_users=subscriber_handler.get_users,
            zerver_recipient=zerver_recipient,
            zerver_stream=zerver_stream,
        )

    personal_subscriptions = build_personal_subscriptions(
        zerver_recipient=zerver_recipient,
    )
    zerver_subscription = personal_subscriptions + stream_subscriptions

    realm['zerver_subscription'] = zerver_subscription

    zerver_realmemoji = write_emoticon_data(
        realm_id=realm_id,
        data_dir=input_data_dir,
        output_dir=output_dir,
    )
    realm['zerver_realmemoji'] = zerver_realmemoji

    subscriber_map = make_subscriber_map(
        zerver_subscription=zerver_subscription,
    )

    logging.info('Start importing message data')
    for message_key in ['UserMessage',
                        'NotificationMessage',
                        'PrivateUserMessage']:
        write_message_data(
            realm_id=realm_id,
            slim_mode=slim_mode,
            message_key=message_key,
            zerver_recipient=zerver_recipient,
            subscriber_map=subscriber_map,
            data_dir=input_data_dir,
            output_dir=output_dir,
            masking_content=masking_content,
            stream_id_mapper=stream_id_mapper,
            user_id_mapper=user_id_mapper,
            user_handler=user_handler,
            attachment_handler=attachment_handler,
        )

    # Order is important here...don't write users until
    # we process everything else, since we may introduce
    # mirror users when processing messages.
    realm['zerver_userprofile'] = user_handler.get_all_users()
    realm['sort_by_date'] = True

    create_converted_data_files(realm, output_dir, '/realm.json')

    logging.info('Start importing avatar data')
    write_avatar_data(
        raw_user_data=raw_user_data,
        output_dir=output_dir,
        user_id_mapper=user_id_mapper,
        realm_id=realm_id,
    )

    attachment_handler.write_info(
        output_dir=output_dir,
        realm_id=realm_id,
    )

    logging.info('Start making tarball')
    subprocess.check_call(["tar", "-czf", output_dir + '.tar.gz', output_dir, '-P'])
    logging.info('Done making tarball')
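
# Illustrative call site (paths are hypothetical; in Zulip this function is
# normally driven by a management command rather than called directly):
#
#     do_convert_data(
#         input_tar_file='/tmp/hipchat-export.tar',
#         output_dir='/tmp/hipchat-converted',
#         masking_content=False,
#     )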