hipchat_import: Remove tool from codebase.

Remove functions and scripts used by the HipChat import tool, along with
those that will no longer be required in the future.
Aman Agrawal 2020-12-23 14:30:24 +05:30 committed by Tim Abbott
parent 62d721e859
commit c685d36821
19 changed files with 17 additions and 1291 deletions

View File

@ -27,6 +27,7 @@ in bursts.
#### Full feature changelog
- Removed HipChat import tool.
- Added support for moving topics to private streams.
- Added support for subscribing another stream's membership to a stream.
- Added RealmAuditLog for most settings state changes in Zulip; this

View File

@ -78,7 +78,6 @@ backup][zulip-backups] or importing your data from [Slack][slack-import],
or another Zulip server, you should stop here
and return to the import instructions.
[hipchat-import]: https://zulip.com/help/import-from-hipchat
[slack-import]: https://zulip.com/help/import-from-slack
[zulip-backups]: ../production/export-and-import.html#backups

View File

@ -59,9 +59,6 @@ httplib2
# Forked to avoid pulling in scipy: https://github.com/mailgun/talon/issues/130
https://github.com/zulip/talon/archive/7d8bdc4dbcfcc5a73298747293b99fe53da55315.zip#egg=talon==1.2.10.zulip1
# Needed for HipChat import
hypchat
# Needed for inlining the CSS in emails
premailer

View File

@ -355,9 +355,6 @@ httplib2==0.18.1 \
--hash=sha256:8af66c1c52c7ffe1aa5dc4bcd7c769885254b0756e6e69f953c7f0ab49a70ba3 \
--hash=sha256:ca2914b015b6247791c4866782fa6042f495b94401a0f0bd3e1d6e0ba2236782 \
# via -r requirements/common.in
hypchat==0.21 \
--hash=sha256:ef37a9cd8103bb13ad772b28ba9223ca9d4278371e374450c3ea2918df70a8e9 \
# via -r requirements/common.in
hyper==0.7.0 \
--hash=sha256:069514f54231fb7b5df2fb910a114663a83306d5296f588fffcb0a9be19407fc \
--hash=sha256:12c82eacd122a659673484c1ea0d34576430afbe5aa6b8f63fe37fcb06a2458c \
@ -915,7 +912,7 @@ python-binary-memcached==0.30.1 \
python-dateutil==2.8.1 \
--hash=sha256:73ebfe9dbf22e832286dafa60473e4cd239f8592f699aa5adaf10050e6e1823c \
--hash=sha256:75bb3f31ea686f1197762692a9ee6a7550b59fc6ca3a1f4b5d7e32fb98e2da2a \
# via -r requirements/common.in, arrow, botocore, hypchat, moto
# via -r requirements/common.in, arrow, botocore, moto
python-debian==0.1.38 \
--hash=sha256:a1f89336d7675a56cdd92fa90cd8c00b9178dabcc6d3e08a397e80eca2b855f3 \
--hash=sha256:a352bb5f9ef19b0272078f516ee0ec42b05e90ac85651d87c10e7041550dcc1d \
@ -1043,7 +1040,7 @@ requests-oauthlib==1.3.0 \
requests[security]==2.25.0 \
--hash=sha256:7f1a0b932f4a60a1a65caa4263921bb7d9ee911957e0ae4a23a6dd08185ad5f8 \
--hash=sha256:e786fa28d8c9154e6a4de5d46a1d921b8749f8b74e28bde23768e5e16eece998 \
# via -r requirements/common.in, docker, hypchat, matrix-client, moto, premailer, pyoembed, python-digitalocean, python-gcm, python-twitter, requests-oauthlib, responses, semgrep, social-auth-core, sphinx, stripe, twilio, zulip
# via -r requirements/common.in, docker, matrix-client, moto, premailer, pyoembed, python-digitalocean, python-gcm, python-twitter, requests-oauthlib, responses, semgrep, social-auth-core, sphinx, stripe, twilio, zulip
responses==0.12.0 \
--hash=sha256:0de50fbf600adf5ef9f0821b85cc537acca98d66bc7776755924476775c1989c \
--hash=sha256:e80d5276011a4b79ecb62c5f82ba07aa23fb31ecbc95ee7cad6de250a3c97444 \
@ -1117,7 +1114,7 @@ sh==1.14.1 \
six==1.15.0 \
--hash=sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259 \
--hash=sha256:8b74bedcbbbaca38ff6d7491d76f2b06b3592611af620f8426e82dddb04a5ced \
# via argon2-cffi, automat, aws-sam-translator, cfn-lint, cryptography, django-bitfield, docker, ecdsa, hypchat, isodate, jsonschema, junit-xml, libthumbor, moto, openapi-core, openapi-schema-validator, openapi-spec-validator, parsel, pip-tools, protego, pyopenssl, python-binary-memcached, python-dateutil, python-debian, python-jose, qrcode, responses, social-auth-app-django, social-auth-core, talon, traitlets, twilio, w3lib, websocket-client, zulip
# via argon2-cffi, automat, aws-sam-translator, cfn-lint, cryptography, django-bitfield, docker, ecdsa, isodate, jsonschema, junit-xml, libthumbor, moto, openapi-core, openapi-schema-validator, openapi-spec-validator, parsel, pip-tools, protego, pyopenssl, python-binary-memcached, python-dateutil, python-debian, python-jose, qrcode, responses, social-auth-app-django, social-auth-core, talon, traitlets, twilio, w3lib, websocket-client, zulip
snakeviz==2.1.0 \
--hash=sha256:8ce375b18ae4a749516d7e6c6fbbf8be6177c53974f53534d8eadb646cd279b1 \
--hash=sha256:92ad876fb6a201a7e23a6b85ea96d9643a51e285667c253a8653643804f7cb68 \

View File

@ -243,9 +243,6 @@ httplib2==0.18.1 \
--hash=sha256:8af66c1c52c7ffe1aa5dc4bcd7c769885254b0756e6e69f953c7f0ab49a70ba3 \
--hash=sha256:ca2914b015b6247791c4866782fa6042f495b94401a0f0bd3e1d6e0ba2236782 \
# via -r requirements/common.in
hypchat==0.21 \
--hash=sha256:ef37a9cd8103bb13ad772b28ba9223ca9d4278371e374450c3ea2918df70a8e9 \
# via -r requirements/common.in
hyper==0.7.0 \
--hash=sha256:069514f54231fb7b5df2fb910a114663a83306d5296f588fffcb0a9be19407fc \
--hash=sha256:12c82eacd122a659673484c1ea0d34576430afbe5aa6b8f63fe37fcb06a2458c \
@ -646,7 +643,7 @@ python-binary-memcached==0.30.1 \
python-dateutil==2.8.1 \
--hash=sha256:73ebfe9dbf22e832286dafa60473e4cd239f8592f699aa5adaf10050e6e1823c \
--hash=sha256:75bb3f31ea686f1197762692a9ee6a7550b59fc6ca3a1f4b5d7e32fb98e2da2a \
# via -r requirements/common.in, botocore, hypchat
# via -r requirements/common.in, botocore
python-gcm==0.4 \
--hash=sha256:511c35fc5ae829f7fc3cbdb45c4ec3fda02f85e4fae039864efe82682ccb9c18 \
# via -r requirements/common.in
@ -751,7 +748,7 @@ requests-oauthlib==1.3.0 \
requests[security]==2.25.0 \
--hash=sha256:7f1a0b932f4a60a1a65caa4263921bb7d9ee911957e0ae4a23a6dd08185ad5f8 \
--hash=sha256:e786fa28d8c9154e6a4de5d46a1d921b8749f8b74e28bde23768e5e16eece998 \
# via -r requirements/common.in, hypchat, matrix-client, premailer, pyoembed, python-gcm, python-twitter, requests-oauthlib, social-auth-core, stripe, twilio, zulip
# via -r requirements/common.in, matrix-client, premailer, pyoembed, python-gcm, python-twitter, requests-oauthlib, social-auth-core, stripe, twilio, zulip
s3transfer==0.3.3 \
--hash=sha256:2482b4259524933a022d59da830f51bd746db62f047d6eb213f2f8855dcb8a13 \
--hash=sha256:921a37e2aefc64145e7b73d50c71bb4f26f46e4c9f414dc648c6245ff92cf7db \
@ -763,7 +760,7 @@ sentry-sdk==0.19.4 \
six==1.15.0 \
--hash=sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259 \
--hash=sha256:8b74bedcbbbaca38ff6d7491d76f2b06b3592611af620f8426e82dddb04a5ced \
# via argon2-cffi, cryptography, django-bitfield, hypchat, isodate, jsonschema, libthumbor, openapi-core, openapi-schema-validator, openapi-spec-validator, pyopenssl, python-binary-memcached, python-dateutil, qrcode, social-auth-app-django, social-auth-core, talon, traitlets, twilio, zulip
# via argon2-cffi, cryptography, django-bitfield, isodate, jsonschema, libthumbor, openapi-core, openapi-schema-validator, openapi-spec-validator, pyopenssl, python-binary-memcached, python-dateutil, qrcode, social-auth-app-django, social-auth-core, talon, traitlets, twilio, zulip
social-auth-app-django==4.0.0 \
--hash=sha256:2c69e57df0b30c9c1823519c5f1992cbe4f3f98fdc7d95c840e091a752708840 \
--hash=sha256:567ad0e028311541d7dfed51d3bf2c60440a6fd236d5d4d06c5a618b3d6c57c5 \

View File

@ -243,8 +243,7 @@
<a class="feature-block" href="/help/import-from-slack" target="_blank" rel="noopener noreferrer">
<h3>DATA IMPORT</h3>
<p>
Import an existing Slack, Mattermost, HipChat, Stride,
or Gitter workspace into Zulip.
Import an existing Slack, Mattermost, or Gitter workspace into Zulip.
</p>
</a>
<a class="feature-block" href="/help/add-custom-profile-fields" target="_blank" rel="noopener noreferrer">

View File

@ -7,7 +7,7 @@ message is about.
|---|---|---
| Zulip | Stream | Topic
| Email | Mailing list | Subject line
| Slack/IRC/HipChat | Channel/Room | -
| Slack/IRC | Channel/Room | -
Messages with the same stream and topic are shown together as a
conversational thread. Here is what it looks like in Zulip.

View File

@ -1,7 +1,7 @@
There are a lot of team chat apps. So why did we build Zulip?
We talk about Slack in the discussion below, but the problems apply equally
to other apps with Slack's conversation model, including HipChat, IRC,
to other apps with Slack's conversation model, including IRC,
Mattermost, Discord, Spark, and others.
## Reading busy Slack channels is extremely inefficient.

View File

@ -120,7 +120,6 @@ not_yet_fully_covered = [
'zerver/tornado/sharding.py',
'zerver/tornado/views.py',
# Data import files; relatively low priority
'zerver/data_import/hipchat*.py',
'zerver/data_import/sequencer.py',
'zerver/data_import/slack.py',
'zerver/data_import/gitter.py',

View File

@ -43,4 +43,4 @@ API_FEATURE_LEVEL = 36
# historical commits sharing the same major version, in which case a
# minor version bump suffices.
PROVISION_VERSION = '119.0'
PROVISION_VERSION = '120.0'

View File

@ -1,882 +0,0 @@
import base64
import glob
import logging
import os
import re
import shutil
import subprocess
from typing import Any, Callable, Dict, List, Optional, Set
import dateutil.parser
import hypchat
import orjson
from django.conf import settings
from django.utils.timezone import now as timezone_now
from zerver.data_import.hipchat_attachment import AttachmentHandler
from zerver.data_import.hipchat_user import UserHandler
from zerver.data_import.import_util import (
SubscriberHandler,
build_message,
build_personal_subscriptions,
build_public_stream_subscriptions,
build_realm,
build_realm_emoji,
build_recipients,
build_stream,
build_stream_subscriptions,
build_user_profile,
build_zerver_realm,
create_converted_data_files,
make_subscriber_map,
make_user_messages,
write_avatar_png,
)
from zerver.data_import.sequencer import NEXT_ID, IdMapper
from zerver.lib.utils import process_list_in_batches
from zerver.models import RealmEmoji, Recipient, UserProfile
# stubs
ZerverFieldsT = Dict[str, Any]
def str_date_to_float(date_str: str) -> float:
'''
Dates look like this:
"2018-08-08T14:23:54Z 626267"
'''
parts = date_str.split(' ')
time_str = parts[0].replace('T', ' ')
date_time = dateutil.parser.parse(time_str)
timestamp = date_time.timestamp()
if len(parts) == 2:
microseconds = int(parts[1])
timestamp += microseconds / 1000000.0
return timestamp
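# Illustration (hypothetical call, values taken from the docstring above):
#   str_date_to_float("2018-08-08T14:23:54Z 626267")
#   == dateutil.parser.parse("2018-08-08 14:23:54Z").timestamp() + 0.626267
# i.e. the trailing integer is treated as microseconds past the whole second.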
def untar_input_file(tar_file: str) -> str:
data_dir = tar_file.replace('.tar', '')
data_dir = os.path.abspath(data_dir)
if os.path.exists(data_dir):
logging.info('input data was already untarred to %s, we will use it', data_dir)
return data_dir
os.makedirs(data_dir)
subprocess.check_call(['tar', '-xf', tar_file, '-C', data_dir])
logging.info('input data was untarred to %s', data_dir)
return data_dir
def read_user_data(data_dir: str) -> List[ZerverFieldsT]:
fn = 'users.json'
data_file = os.path.join(data_dir, fn)
with open(data_file, "rb") as fp:
return orjson.loads(fp.read())
def convert_user_data(user_handler: UserHandler,
slim_mode: bool,
user_id_mapper: IdMapper,
raw_data: List[ZerverFieldsT],
realm_id: int) -> None:
flat_data = [
d['User']
for d in raw_data
]
def process(in_dict: ZerverFieldsT) -> ZerverFieldsT:
delivery_email = in_dict['email']
email = in_dict['email']
full_name = in_dict['name']
id = user_id_mapper.get(in_dict['id'])
is_mirror_dummy = False
short_name = in_dict['mention_name']
timezone = in_dict['timezone']
role = UserProfile.ROLE_MEMBER
if in_dict['account_type'] == 'admin':
role = UserProfile.ROLE_REALM_ADMINISTRATOR
if in_dict['account_type'] == 'guest':
role = UserProfile.ROLE_GUEST
date_joined = int(timezone_now().timestamp())
is_active = not in_dict['is_deleted']
if not email:
if role == UserProfile.ROLE_GUEST:
# HipChat guest users don't have emails, so
# we just fake them.
email = f'guest-{id}@example.com'
delivery_email = email
else:
# HipChat sometimes doesn't export an email for deactivated users.
assert not is_active
email = delivery_email = f"deactivated-{id}@example.com"
# unmapped fields:
# title - Developer, Project Manager, etc.
# rooms - no good sample data
# created - we just use "now"
# roles - we just use account_type
if in_dict.get('avatar'):
avatar_source = 'U'
else:
avatar_source = 'G'
return build_user_profile(
avatar_source=avatar_source,
date_joined=date_joined,
delivery_email=delivery_email,
email=email,
full_name=full_name,
id=id,
is_active=is_active,
role=role,
is_mirror_dummy=is_mirror_dummy,
realm_id=realm_id,
short_name=short_name,
timezone=timezone,
)
for raw_item in flat_data:
user = process(raw_item)
user_handler.add_user(user)
def convert_avatar_data(avatar_folder: str,
raw_data: List[ZerverFieldsT],
user_id_mapper: IdMapper,
realm_id: int) -> List[ZerverFieldsT]:
'''
This code is pretty specific to how HipChat sends us data.
They give us the avatar payloads in base64 in users.json.
We process avatars in our own pass of that data, rather
than doing it while we're getting other user data. I
chose to keep this separate, as otherwise you have a lot
of extraneous data getting passed around.
This code has MAJOR SIDE EFFECTS--namely writing a bunch
of files to the avatars directory.
'''
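# A users.json entry is assumed to look roughly like this (values invented
# for illustration; only 'id' and the base64 'avatar' payload matter here):
#   {"User": {"id": 10, "name": "Sample User", "avatar": "iVBORw0KGgo..."}}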
avatar_records = []
for d in raw_data:
raw_user = d['User']
avatar_payload = raw_user.get('avatar')
if not avatar_payload:
continue
bits = base64.b64decode(avatar_payload)
raw_user_id = raw_user['id']
if not user_id_mapper.has(raw_user_id):
continue
user_id = user_id_mapper.get(raw_user_id)
metadata = write_avatar_png(
avatar_folder=avatar_folder,
realm_id=realm_id,
user_id=user_id,
bits=bits,
)
avatar_records.append(metadata)
return avatar_records
def read_room_data(data_dir: str) -> List[ZerverFieldsT]:
fn = 'rooms.json'
data_file = os.path.join(data_dir, fn)
with open(data_file, "rb") as f:
data = orjson.loads(f.read())
return data
def convert_room_data(raw_data: List[ZerverFieldsT],
subscriber_handler: SubscriberHandler,
stream_id_mapper: IdMapper,
user_id_mapper: IdMapper,
realm_id: int,
api_token: Optional[str]=None) -> List[ZerverFieldsT]:
flat_data = [
d['Room']
for d in raw_data
]
def get_invite_only(v: str) -> bool:
if v == 'public':
return False
elif v == 'private':
return True
else:
raise Exception('unexpected value')
streams = []
for in_dict in flat_data:
now = int(timezone_now().timestamp())
stream_id = stream_id_mapper.get(in_dict['id'])
invite_only = get_invite_only(in_dict['privacy'])
stream = build_stream(
date_created=now,
realm_id=realm_id,
name=in_dict['name'],
description=in_dict['topic'],
stream_id=stream_id,
deactivated=in_dict['is_archived'],
invite_only=invite_only,
)
if invite_only:
users: Set[int] = {
user_id_mapper.get(key)
for key in in_dict['members']
if user_id_mapper.has(key)
}
if user_id_mapper.has(in_dict['owner']):
owner = user_id_mapper.get(in_dict['owner'])
users.add(owner)
else:
users = set()
if api_token is not None:
hc = hypchat.HypChat(api_token)
room_data = hc.fromurl('{}/v2/room/{}/member'.format(hc.endpoint, in_dict['id']))
for item in room_data['items']:
hipchat_user_id = item['id']
zulip_user_id = user_id_mapper.get(hipchat_user_id)
users.add(zulip_user_id)
if users:
subscriber_handler.set_info(
stream_id=stream_id,
users=users,
)
# unmapped fields:
# guest_access_url: no Zulip equivalent
# created: we just use "now"
# participants: no good sample data
streams.append(stream)
return streams
def make_realm(realm_id: int) -> ZerverFieldsT:
NOW = float(timezone_now().timestamp())
domain_name = settings.EXTERNAL_HOST
realm_subdomain = ""
zerver_realm = build_zerver_realm(realm_id, realm_subdomain, NOW, 'HipChat')
realm = build_realm(zerver_realm, realm_id, domain_name)
# We may override these later.
realm['zerver_defaultstream'] = []
return realm
def write_avatar_data(raw_user_data: List[ZerverFieldsT],
output_dir: str,
user_id_mapper: IdMapper,
realm_id: int) -> None:
avatar_folder = os.path.join(output_dir, 'avatars')
avatar_realm_folder = os.path.join(avatar_folder, str(realm_id))
os.makedirs(avatar_realm_folder, exist_ok=True)
avatar_records = convert_avatar_data(
avatar_folder=avatar_folder,
raw_data=raw_user_data,
user_id_mapper=user_id_mapper,
realm_id=realm_id,
)
create_converted_data_files(avatar_records, output_dir, '/avatars/records.json')
def write_emoticon_data(realm_id: int,
data_dir: str,
output_dir: str) -> List[ZerverFieldsT]:
'''
This function does most of the work for processing emoticons, the bulk
of which is copying files. We also write a json file with metadata.
Finally, we return a list of RealmEmoji dicts to our caller.
In our data_dir we have a pretty simple setup:
emoticons.json - has very simple metadata on emojis:
{
"Emoticon": {
"id": 9875487,
"path": "emoticons/yasss.jpg",
"shortcut": "yasss"
}
},
{
"Emoticon": {
"id": 718017,
"path": "emoticons/yayyyyy.gif",
"shortcut": "yayyyyy"
}
}
emoticons/ - contains a bunch of image files:
slytherinsnake.gif
spanishinquisition.jpg
sparkle.png
spiderman.gif
stableparrot.gif
stalkerparrot.gif
supergirl.png
superman.png
We move all the relevant files to Zulip's more nested
directory structure.
'''
logging.info('Starting to process emoticons')
fn = 'emoticons.json'
data_file = os.path.join(data_dir, fn)
if not os.path.exists(data_file):
logging.warning("HipChat export does not contain emoticons.json.")
logging.warning("As a result, custom emoji cannot be imported.")
return []
with open(data_file, "rb") as f:
data = orjson.loads(f.read())
if isinstance(data, dict) and 'Emoticons' in data:
# Handle the hc-migrate export format for emoticons.json.
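# Assumed hc-migrate shape (illustrative): a dict rather than a list, e.g.
#   {"Emoticons": [{"path": "emoticons/yasss.jpg", "shortcut": "yasss"}, ...]}
# Only 'path' and 'shortcut' are consumed below.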
flat_data = [
dict(
path=d['path'],
name=d['shortcut'],
)
for d in data['Emoticons']
]
else:
flat_data = [
dict(
path=d['Emoticon']['path'],
name=d['Emoticon']['shortcut'],
)
for d in data
]
emoji_folder = os.path.join(output_dir, 'emoji')
os.makedirs(emoji_folder, exist_ok=True)
def process(data: ZerverFieldsT) -> ZerverFieldsT:
source_sub_path = data['path']
source_fn = os.path.basename(source_sub_path)
source_path = os.path.join(data_dir, source_sub_path)
# Use our template from RealmEmoji
# PATH_ID_TEMPLATE = "{realm_id}/emoji/images/{emoji_file_name}"
target_fn = source_fn
target_sub_path = RealmEmoji.PATH_ID_TEMPLATE.format(
realm_id=realm_id,
emoji_file_name=target_fn,
)
target_path = os.path.join(emoji_folder, target_sub_path)
os.makedirs(os.path.dirname(target_path), exist_ok=True)
source_path = os.path.abspath(source_path)
target_path = os.path.abspath(target_path)
shutil.copyfile(source_path, target_path)
return dict(
path=target_path,
s3_path=target_path,
file_name=target_fn,
realm_id=realm_id,
name=data['name'],
)
emoji_records = list(map(process, flat_data))
create_converted_data_files(emoji_records, output_dir, '/emoji/records.json')
realmemoji = [
build_realm_emoji(
realm_id=realm_id,
name=rec['name'],
id=NEXT_ID('realmemoji'),
file_name=rec['file_name'],
)
for rec in emoji_records
]
logging.info('Done processing emoticons')
return realmemoji
def write_message_data(realm_id: int,
slim_mode: bool,
message_key: str,
zerver_recipient: List[ZerverFieldsT],
subscriber_map: Dict[int, Set[int]],
data_dir: str,
output_dir: str,
masking_content: bool,
stream_id_mapper: IdMapper,
user_id_mapper: IdMapper,
user_handler: UserHandler,
attachment_handler: AttachmentHandler) -> None:
stream_id_to_recipient_id = {
d['type_id']: d['id']
for d in zerver_recipient
if d['type'] == Recipient.STREAM
}
user_id_to_recipient_id = {
d['type_id']: d['id']
for d in zerver_recipient
if d['type'] == Recipient.PERSONAL
}
def get_stream_recipient_id(raw_message: ZerverFieldsT) -> int:
fn_id = raw_message['fn_id']
stream_id = stream_id_mapper.get(fn_id)
recipient_id = stream_id_to_recipient_id[stream_id]
return recipient_id
def get_pm_recipient_id(raw_message: ZerverFieldsT) -> int:
raw_user_id = raw_message['receiver_id']
assert raw_user_id
user_id = user_id_mapper.get(raw_user_id)
recipient_id = user_id_to_recipient_id[user_id]
return recipient_id
if message_key in ['UserMessage', 'NotificationMessage']:
is_pm_data = False
dir_glob = os.path.join(data_dir, 'rooms', '*', 'history.json')
get_recipient_id = get_stream_recipient_id
get_files_dir = lambda fn_id: os.path.join(data_dir, 'rooms', str(fn_id), 'files')
elif message_key == 'PrivateUserMessage':
is_pm_data = True
dir_glob = os.path.join(data_dir, 'users', '*', 'history.json')
get_recipient_id = get_pm_recipient_id
get_files_dir = lambda fn_id: os.path.join(data_dir, 'users', 'files')
else:
raise Exception('programming error: invalid message_key: ' + message_key)
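# Illustrative layout of the untarred export (directory names hypothetical):
#   <data_dir>/rooms/<room_id>/history.json  - stream history; files under rooms/<room_id>/files
#   <data_dir>/users/<user_id>/history.json  - PM history; files under users/files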
history_files = glob.glob(dir_glob)
for fn in history_files:
dir = os.path.dirname(fn)
fn_id = os.path.basename(dir)
files_dir = get_files_dir(fn_id)
process_message_file(
realm_id=realm_id,
slim_mode=slim_mode,
fn=fn,
fn_id=fn_id,
files_dir=files_dir,
get_recipient_id=get_recipient_id,
message_key=message_key,
subscriber_map=subscriber_map,
data_dir=data_dir,
output_dir=output_dir,
is_pm_data=is_pm_data,
masking_content=masking_content,
user_id_mapper=user_id_mapper,
user_handler=user_handler,
attachment_handler=attachment_handler,
)
def get_hipchat_sender_id(realm_id: int,
slim_mode: bool,
message_dict: Dict[str, Any],
user_id_mapper: IdMapper,
user_handler: UserHandler) -> Optional[int]:
'''
The HipChat export is inconsistent in how it renders
senders, and sometimes we don't even get an id.
'''
if isinstance(message_dict['sender'], str):
if slim_mode:
return None
# Some HipChat instances just give us a person's
# name in the sender field for NotificationMessage.
# We turn them into a mirror user.
mirror_user = user_handler.get_mirror_user(
realm_id=realm_id,
name=message_dict['sender'],
)
sender_id = mirror_user['id']
return sender_id
raw_sender_id = message_dict['sender']['id']
if raw_sender_id == 0:
if slim_mode:
return None
mirror_user = user_handler.get_mirror_user(
realm_id=realm_id,
name=message_dict['sender']['name'],
)
sender_id = mirror_user['id']
return sender_id
if not user_id_mapper.has(raw_sender_id):
if slim_mode:
return None
mirror_user = user_handler.get_mirror_user(
realm_id=realm_id,
name=message_dict['sender']['id'],
)
sender_id = mirror_user['id']
return sender_id
# HAPPY PATH: HipChat just gave us an ordinary
# sender_id.
sender_id = user_id_mapper.get(raw_sender_id)
return sender_id
def process_message_file(realm_id: int,
slim_mode: bool,
fn: str,
fn_id: str,
files_dir: str,
get_recipient_id: Callable[[ZerverFieldsT], int],
message_key: str,
subscriber_map: Dict[int, Set[int]],
data_dir: str,
output_dir: str,
is_pm_data: bool,
masking_content: bool,
user_id_mapper: IdMapper,
user_handler: UserHandler,
attachment_handler: AttachmentHandler) -> None:
def get_raw_messages(fn: str) -> List[ZerverFieldsT]:
with open(fn, "rb") as f:
data = orjson.loads(f.read())
flat_data = [
d[message_key]
for d in data
if message_key in d
]
def get_raw_message(d: Dict[str, Any]) -> Optional[ZerverFieldsT]:
sender_id = get_hipchat_sender_id(
realm_id=realm_id,
slim_mode=slim_mode,
message_dict=d,
user_id_mapper=user_id_mapper,
user_handler=user_handler,
)
if sender_id is None:
return None
if is_pm_data:
# We need to compare with str() on both sides here.
# In Stride, user IDs are strings, but in HipChat,
# they are integers, and fn_id is always a string.
if str(sender_id) != str(fn_id):
# PMs are in multiple places in the HipChat export,
# and we only use the copy from the sender
return None
content = d['message']
if masking_content:
content = re.sub('[a-z]', 'x', content)
content = re.sub('[A-Z]', 'X', content)
return dict(
fn_id=fn_id,
sender_id=sender_id,
receiver_id=d.get('receiver', {}).get('id'),
content=content,
mention_user_ids=d.get('mentions', []),
date_sent=str_date_to_float(d['timestamp']),
attachment=d.get('attachment'),
files_dir=files_dir,
)
raw_messages = []
for d in flat_data:
raw_message = get_raw_message(d)
if raw_message is not None:
raw_messages.append(raw_message)
return raw_messages
raw_messages = get_raw_messages(fn)
def process_batch(lst: List[Any]) -> None:
process_raw_message_batch(
realm_id=realm_id,
raw_messages=lst,
subscriber_map=subscriber_map,
user_id_mapper=user_id_mapper,
user_handler=user_handler,
attachment_handler=attachment_handler,
get_recipient_id=get_recipient_id,
is_pm_data=is_pm_data,
output_dir=output_dir,
)
chunk_size = 1000
process_list_in_batches(
lst=raw_messages,
chunk_size=chunk_size,
process_batch=process_batch,
)
def process_raw_message_batch(realm_id: int,
raw_messages: List[Dict[str, Any]],
subscriber_map: Dict[int, Set[int]],
user_id_mapper: IdMapper,
user_handler: UserHandler,
attachment_handler: AttachmentHandler,
get_recipient_id: Callable[[ZerverFieldsT], int],
is_pm_data: bool,
output_dir: str) -> None:
def fix_mentions(content: str,
mention_user_ids: Set[int]) -> str:
for user_id in mention_user_ids:
user = user_handler.get_user(user_id=user_id)
hipchat_mention = '@{short_name}'.format(**user)
zulip_mention = '@**{full_name}**'.format(**user)
content = content.replace(hipchat_mention, zulip_mention)
content = content.replace('@here', '@**all**')
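# Example with a hypothetical user record
# {'short_name': 'amanda', 'full_name': 'Amanda Smith'}:
#   "hi @amanda, @here"  ->  "hi @**Amanda Smith**, @**all**"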
return content
mention_map: Dict[int, Set[int]] = {}
zerver_message = []
import html2text
h = html2text.HTML2Text()
for raw_message in raw_messages:
# One side effect here:
message_id = NEXT_ID('message')
mention_user_ids = {
user_id_mapper.get(id)
for id in set(raw_message['mention_user_ids'])
if user_id_mapper.has(id)
}
mention_map[message_id] = mention_user_ids
content = fix_mentions(
content=raw_message['content'],
mention_user_ids=mention_user_ids,
)
content = h.handle(content)
if len(content) > 10000:
logging.info('skipping too-long message of length %s', len(content))
continue
date_sent = raw_message['date_sent']
try:
recipient_id = get_recipient_id(raw_message)
except KeyError:
logging.debug("Could not find recipient_id for a message, skipping.")
continue
rendered_content = None
if is_pm_data:
topic_name = ''
else:
topic_name = 'imported from HipChat'
user_id = raw_message['sender_id']
# Another side effect:
extra_content = attachment_handler.handle_message_data(
realm_id=realm_id,
message_id=message_id,
sender_id=user_id,
attachment=raw_message['attachment'],
files_dir=raw_message['files_dir'],
)
if extra_content:
has_attachment = True
content += '\n' + extra_content
else:
has_attachment = False
message = build_message(
content=content,
message_id=message_id,
date_sent=date_sent,
recipient_id=recipient_id,
rendered_content=rendered_content,
topic_name=topic_name,
user_id=user_id,
has_attachment=has_attachment,
)
zerver_message.append(message)
zerver_usermessage = make_user_messages(
zerver_message=zerver_message,
subscriber_map=subscriber_map,
is_pm_data=is_pm_data,
mention_map=mention_map,
)
message_json = dict(
zerver_message=zerver_message,
zerver_usermessage=zerver_usermessage,
)
dump_file_id = NEXT_ID('dump_file_id')
message_file = f"/messages-{dump_file_id:06}.json"
create_converted_data_files(message_json, output_dir, message_file)
def do_convert_data(input_tar_file: str,
output_dir: str,
masking_content: bool,
api_token: Optional[str]=None,
slim_mode: bool=False) -> None:
input_data_dir = untar_input_file(input_tar_file)
attachment_handler = AttachmentHandler()
user_handler = UserHandler()
subscriber_handler = SubscriberHandler()
user_id_mapper = IdMapper()
stream_id_mapper = IdMapper()
realm_id = 0
realm = make_realm(realm_id=realm_id)
# users.json -> UserProfile
raw_user_data = read_user_data(data_dir=input_data_dir)
convert_user_data(
user_handler=user_handler,
slim_mode=slim_mode,
user_id_mapper=user_id_mapper,
raw_data=raw_user_data,
realm_id=realm_id,
)
normal_users = user_handler.get_normal_users()
# Don't write zerver_userprofile here, because we
# may add more users later.
# streams.json -> Stream
raw_stream_data = read_room_data(data_dir=input_data_dir)
zerver_stream = convert_room_data(
raw_data=raw_stream_data,
subscriber_handler=subscriber_handler,
stream_id_mapper=stream_id_mapper,
user_id_mapper=user_id_mapper,
realm_id=realm_id,
api_token=api_token,
)
realm['zerver_stream'] = zerver_stream
zerver_recipient = build_recipients(
zerver_userprofile=normal_users,
zerver_stream=zerver_stream,
)
realm['zerver_recipient'] = zerver_recipient
if api_token is None:
if slim_mode:
public_stream_subscriptions: List[ZerverFieldsT] = []
else:
public_stream_subscriptions = build_public_stream_subscriptions(
zerver_userprofile=normal_users,
zerver_recipient=zerver_recipient,
zerver_stream=zerver_stream,
)
private_stream_subscriptions = build_stream_subscriptions(
get_users=subscriber_handler.get_users,
zerver_recipient=zerver_recipient,
zerver_stream=[stream_dict for stream_dict in zerver_stream
if stream_dict['invite_only']],
)
stream_subscriptions = public_stream_subscriptions + private_stream_subscriptions
else:
stream_subscriptions = build_stream_subscriptions(
get_users=subscriber_handler.get_users,
zerver_recipient=zerver_recipient,
zerver_stream=zerver_stream,
)
personal_subscriptions = build_personal_subscriptions(
zerver_recipient=zerver_recipient,
)
zerver_subscription = personal_subscriptions + stream_subscriptions
realm['zerver_subscription'] = zerver_subscription
zerver_realmemoji = write_emoticon_data(
realm_id=realm_id,
data_dir=input_data_dir,
output_dir=output_dir,
)
realm['zerver_realmemoji'] = zerver_realmemoji
subscriber_map = make_subscriber_map(
zerver_subscription=zerver_subscription,
)
logging.info('Start importing message data')
for message_key in ['UserMessage',
'NotificationMessage',
'PrivateUserMessage']:
write_message_data(
realm_id=realm_id,
slim_mode=slim_mode,
message_key=message_key,
zerver_recipient=zerver_recipient,
subscriber_map=subscriber_map,
data_dir=input_data_dir,
output_dir=output_dir,
masking_content=masking_content,
stream_id_mapper=stream_id_mapper,
user_id_mapper=user_id_mapper,
user_handler=user_handler,
attachment_handler=attachment_handler,
)
# Order is important here...don't write users until
# we process everything else, since we may introduce
# mirror users when processing messages.
realm['zerver_userprofile'] = user_handler.get_all_users()
realm['sort_by_date'] = True
create_converted_data_files(realm, output_dir, '/realm.json')
logging.info('Start importing avatar data')
write_avatar_data(
raw_user_data=raw_user_data,
output_dir=output_dir,
user_id_mapper=user_id_mapper,
realm_id=realm_id,
)
attachment_handler.write_info(
output_dir=output_dir,
realm_id=realm_id,
)
logging.info('Start making tarball')
subprocess.check_call(["tar", "-czf", output_dir + '.tar.gz', output_dir, '-P'])
logging.info('Done making tarball')

View File

@ -1,136 +0,0 @@
import logging
import os
import shutil
from typing import Any, Dict, List, Optional
from zerver.data_import.import_util import build_attachment, create_converted_data_files
class AttachmentHandler:
def __init__(self) -> None:
self.info_dict: Dict[str, Dict[str, Any]] = {}
def handle_message_data(self,
realm_id: int,
message_id: int,
sender_id: int,
attachment: Dict[str, Any],
files_dir: str) -> Optional[str]:
if not attachment:
return None
name = attachment['name']
if 'path' not in attachment:
logging.info('Skipping HipChat attachment with missing path data: ' + name)
return None
size = attachment['size']
path = attachment['path']
local_fn = os.path.join(files_dir, path)
if not os.path.exists(local_fn):
# HipChat has an option to not include these in its
# exports, since file uploads can be very large.
logging.info('Skipping attachment with no file data: ' + local_fn)
return None
target_path = os.path.join(
str(realm_id),
'HipChatImportAttachment',
path,
)
if target_path in self.info_dict:
logging.info("file used multiple times: " + path)
info = self.info_dict[target_path]
info['message_ids'].add(message_id)
return info['content']
# HipChat provides size info, but it's not
# completely trustworthy, so we just
# ask the OS for file details.
size = os.path.getsize(local_fn)
mtime = os.path.getmtime(local_fn)
content = f'[{name}](/user_uploads/{target_path})'
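# Example (hypothetical attachment): for realm_id=0, path="files/1/report.pdf",
# and name="report.pdf", the message gets the Markdown link
#   [report.pdf](/user_uploads/0/HipChatImportAttachment/files/1/report.pdf)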
info = dict(
message_ids={message_id},
sender_id=sender_id,
local_fn=local_fn,
target_path=target_path,
name=name,
size=size,
mtime=mtime,
content=content,
)
self.info_dict[target_path] = info
return content
def write_info(self, output_dir: str, realm_id: int) -> None:
attachments: List[Dict[str, Any]] = []
uploads_records: List[Dict[str, Any]] = []
def add_attachment(info: Dict[str, Any]) -> None:
build_attachment(
realm_id=realm_id,
message_ids=info['message_ids'],
user_id=info['sender_id'],
fileinfo=dict(
created=info['mtime'], # minor lie
size=info['size'],
name=info['name'],
),
s3_path=info['target_path'],
zerver_attachment=attachments,
)
def add_upload(info: Dict[str, Any]) -> None:
target_path = info['target_path']
upload_rec = dict(
size=info['size'],
user_profile_id=info['sender_id'],
realm_id=realm_id,
s3_path=target_path,
path=target_path,
content_type=None,
)
uploads_records.append(upload_rec)
def make_full_target_path(info: Dict[str, Any]) -> str:
target_path = info['target_path']
full_target_path = os.path.join(
output_dir,
'uploads',
target_path,
)
full_target_path = os.path.abspath(full_target_path)
os.makedirs(os.path.dirname(full_target_path), exist_ok=True)
return full_target_path
def copy_file(info: Dict[str, Any]) -> None:
source_path = info['local_fn']
target_path = make_full_target_path(info)
shutil.copyfile(source_path, target_path)
logging.info('Start processing attachment files')
for info in self.info_dict.values():
add_attachment(info)
add_upload(info)
copy_file(info)
uploads_folder = os.path.join(output_dir, 'uploads')
os.makedirs(os.path.join(uploads_folder, str(realm_id)), exist_ok=True)
attachment = dict(
zerver_attachment=attachments,
)
create_converted_data_files(uploads_records, output_dir, '/uploads/records.json')
create_converted_data_files(attachment, output_dir, '/attachment.json')
logging.info('Done processing attachment files')

View File

@ -1,84 +0,0 @@
from typing import Any, Dict, List
from django.utils.timezone import now as timezone_now
from zerver.data_import.import_util import build_user_profile
from zerver.models import UserProfile
class UserHandler:
'''
Our UserHandler class is a glorified wrapper
around the data that eventually goes into
zerver_userprofile.
The class helps us do things like map ids
to names for mentions.
We also sometimes need to build mirror
users on the fly.
'''
def __init__(self) -> None:
self.id_to_user_map: Dict[int, Dict[str, Any]] = {}
self.name_to_mirror_user_map: Dict[str, Dict[str, Any]] = {}
self.mirror_user_id = 1
def add_user(self, user: Dict[str, Any]) -> None:
user_id = user['id']
self.id_to_user_map[user_id] = user
def get_user(self, user_id: int) -> Dict[str, Any]:
user = self.id_to_user_map[user_id]
return user
def get_mirror_user(self,
realm_id: int,
name: str) -> Dict[str, Any]:
if name in self.name_to_mirror_user_map:
user = self.name_to_mirror_user_map[name]
return user
user_id = self._new_mirror_user_id()
short_name = name
full_name = name
email = f'mirror-{user_id}@example.com'
delivery_email = email
avatar_source = 'G'
date_joined = int(timezone_now().timestamp())
timezone = 'UTC'
user = build_user_profile(
avatar_source=avatar_source,
date_joined=date_joined,
delivery_email=delivery_email,
email=email,
full_name=full_name,
id=user_id,
is_active=False,
role=UserProfile.ROLE_MEMBER,
is_mirror_dummy=True,
realm_id=realm_id,
short_name=short_name,
timezone=timezone,
)
self.name_to_mirror_user_map[name] = user
return user
def _new_mirror_user_id(self) -> int:
next_id = self.mirror_user_id
while next_id in self.id_to_user_map:
next_id += 1
self.mirror_user_id = next_id + 1
return next_id
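# Example: if normal users already occupy ids 1-3, the first mirror user
# gets id 4 and the counter advances to 5 for the next one.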
def get_normal_users(self) -> List[Dict[str, Any]]:
users = list(self.id_to_user_map.values())
return users
def get_all_users(self) -> List[Dict[str, Any]]:
normal_users = self.get_normal_users()
mirror_users = list(self.name_to_mirror_user_map.values())
all_users = normal_users + mirror_users
return all_users

View File

@ -170,8 +170,8 @@ def build_public_stream_subscriptions(
zerver_recipient: List[ZerverFieldsT],
zerver_stream: List[ZerverFieldsT]) -> List[ZerverFieldsT]:
'''
This function is only used for HipChat now, but it may apply to
future conversions. We often don't get full subscriber data in
This function was only used for HipChat, but it may apply to
future conversions. We often didn't get full subscriber data in
the HipChat export, so this function just autosubscribes all
users to every public stream. This returns a list of Subscription
dicts.
@ -298,8 +298,8 @@ def build_recipients(zerver_userprofile: Iterable[ZerverFieldsT],
zerver_stream: Iterable[ZerverFieldsT],
zerver_huddle: Iterable[ZerverFieldsT] = []) -> List[ZerverFieldsT]:
'''
As of this writing, we only use this in the HipChat
conversion. The Slack and Gitter conversions do it more
This function was only used by the HipChat import, but it may be
required for future conversions. The Slack and Gitter conversions do it more
tightly integrated with creating other objects.
'''

View File

@ -8,7 +8,7 @@ sequences work.
You need to be a bit careful here, since
you're dealing with a big singleton, but
for data imports that's usually easy to
manage. See hipchat.py for example usage.
manage.
'''
def _seq() -> Callable[[], int]:

View File

@ -52,10 +52,6 @@ TAB_DISPLAY_NAMES = {
'desktop': 'Desktop',
'mobile': 'Mobile',
'cloud': 'HipChat Cloud',
'server': 'HipChat Server or Data Center',
'stride': 'Stride',
'mm-default': 'Default installation',
'mm-docker': 'Docker',
'mm-gitlab-omnibus': 'GitLab Omnibus',

View File

@ -1,81 +0,0 @@
import argparse
import os
from typing import Any
'''
Example usage for testing purposes:
Move the data:
rm -Rf ~/hipchat-data
mkdir ~/hipchat-data
./manage.py convert_hipchat_data ~/hipchat-31028-2018-08-08_23-23-22.tar --output ~/hipchat-data
./manage.py import --destroy-rebuild-database hipchat ~/hipchat-data
Test out the realm:
./tools/run-dev.py
go to browser and use your dev url
spec:
https://confluence.atlassian.com/hipchatkb/
exporting-from-hipchat-server-or-data-center-for-data-portability-950821555.html
'''
from django.core.management.base import BaseCommand, CommandError, CommandParser
from zerver.data_import.hipchat import do_convert_data
class Command(BaseCommand):
help = """Convert the HipChat data into Zulip data format."""
def add_arguments(self, parser: CommandParser) -> None:
parser.add_argument('hipchat_tar', nargs='+',
metavar='<hipchat data tarfile>',
help="tar of HipChat data")
parser.add_argument('--output', dest='output_dir',
help='Directory to write exported data to.')
parser.add_argument('--mask', dest='masking_content',
action="store_true",
help='Mask the content for privacy during QA.')
parser.add_argument('--slim-mode',
action="store_true",
help="Default to no public stream subscriptions if no token is available." +
" See import docs for details.")
parser.add_argument('--token', dest='api_token',
help='API token for the HipChat API for fetching subscribers.')
parser.formatter_class = argparse.RawTextHelpFormatter
def handle(self, *args: Any, **options: Any) -> None:
output_dir = options["output_dir"]
if output_dir is None:
raise CommandError("You need to specify --output <output directory>")
if os.path.exists(output_dir) and not os.path.isdir(output_dir):
raise CommandError(output_dir + " is not a directory")
os.makedirs(output_dir, exist_ok=True)
if os.listdir(output_dir):
raise CommandError('Output directory should be empty!')
output_dir = os.path.realpath(output_dir)
for path in options['hipchat_tar']:
if not os.path.exists(path):
raise CommandError(f"Tar file not found: '{path}'")
print("Converting data ...")
do_convert_data(
input_tar_file=path,
output_dir=output_dir,
masking_content=options.get('masking_content', False),
slim_mode=options['slim_mode'],
api_token=options.get("api_token"),
)

View File

@ -1,76 +0,0 @@
from typing import Any, Dict
from zerver.data_import.hipchat import get_hipchat_sender_id
from zerver.data_import.hipchat_user import UserHandler
from zerver.data_import.sequencer import IdMapper
from zerver.lib.test_classes import ZulipTestCase
class HipChatImporter(ZulipTestCase):
def test_sender_ids(self) -> None:
realm_id = 5
user_handler = UserHandler()
user_id_mapper = IdMapper()
self.assertEqual(user_id_mapper.get(1), 1)
# Simulate a "normal" user first.
user_with_id = dict(
id=1,
# other fields don't matter here
)
user_handler.add_user(user=user_with_id)
normal_message: Dict[str, Any] = dict(
sender=dict(
id=1,
),
)
sender_id = get_hipchat_sender_id(
realm_id=realm_id,
slim_mode=False,
message_dict=normal_message,
user_id_mapper=user_id_mapper,
user_handler=user_handler,
)
self.assertEqual(sender_id, 1)
bot_message = dict(
sender='fred_bot',
)
# Every message from fred_bot should
# return the same sender_id.
fred_bot_sender_id = 2
for i in range(3):
sender_id = get_hipchat_sender_id(
realm_id=realm_id,
slim_mode=False,
message_dict=bot_message,
user_id_mapper=user_id_mapper,
user_handler=user_handler,
)
self.assertEqual(sender_id, fred_bot_sender_id)
id_zero_message = dict(
sender=dict(
id=0,
name='hal_bot',
),
)
hal_bot_sender_id = 3
for i in range(3):
sender_id = get_hipchat_sender_id(
realm_id=realm_id,
slim_mode=False,
message_dict=id_zero_message,
user_id_mapper=user_id_mapper,
user_handler=user_handler,
)
self.assertEqual(sender_id, hal_bot_sender_id)

View File

@ -51,7 +51,7 @@ def api_teamcity_webhook(request: HttpRequest, user_profile: UserProfile,
payload: Dict[str, Any]=REQ(argument_type='body')) -> HttpResponse:
message = payload.get('build')
if message is None:
# Ignore third-party specific (e.g. Slack/HipChat) payload formats
# Ignore third-party specific (e.g. Slack) payload formats
# and notify the bot owner
message = MISCONFIGURED_PAYLOAD_TYPE_ERROR_MESSAGE.format(
bot_name=user_profile.full_name,