2018-10-13 16:25:44 +02:00
|
|
|
import logging
|
|
|
|
import os
|
2020-06-11 00:54:34 +02:00
|
|
|
import shutil
|
|
|
|
from typing import Any, Dict, List, Optional
|
2018-10-13 16:25:44 +02:00
|
|
|
|
2020-06-11 00:54:34 +02:00
|
|
|
from zerver.data_import.import_util import build_attachment, create_converted_data_files
|
2018-10-13 16:25:44 +02:00
|
|
|
|
|
|
|
|
|
|
|
class AttachmentHandler:
    """Tracks attachments while converting a HipChat export to Zulip format.

    `handle_message_data` is called once per message as messages are
    converted; it deduplicates files by target path and returns the
    Markdown link to embed in the message.  `write_info` is called at the
    end to emit the Zulip attachment/upload records and copy the files
    into the export directory.
    """

    def __init__(self) -> None:
        # Maps target_path -> info dict with keys: message_ids (set),
        # sender_id, local_fn, target_path, name, size, mtime, content.
        self.info_dict: Dict[str, Dict[str, Any]] = {}

    def handle_message_data(self,
                            realm_id: int,
                            message_id: int,
                            sender_id: int,
                            attachment: Dict[str, Any],
                            files_dir: str) -> Optional[str]:
        """Record one message's attachment and return its Markdown link.

        Returns None when there is no attachment, when the export lacks
        path data for it, or when the file itself is missing on disk
        (HipChat exports can omit file contents).
        """
        if not attachment:
            return None

        name = attachment['name']

        if 'path' not in attachment:
            logging.info('Skipping HipChat attachment with missing path data: %s', name)
            return None

        path = attachment['path']

        local_fn = os.path.join(files_dir, path)

        if not os.path.exists(local_fn):
            # HipChat has an option to not include these in its
            # exports, since file uploads can be very large.
            logging.info('Skipping attachment with no file data: %s', local_fn)
            return None

        target_path = os.path.join(
            str(realm_id),
            'HipChatImportAttachment',
            path,
        )

        if target_path in self.info_dict:
            # Same file referenced by multiple messages: just record the
            # extra message id and reuse the already-built content.
            logging.info("file used multiple times: %s", path)
            info = self.info_dict[target_path]
            info['message_ids'].add(message_id)
            return info['content']

        # HipChat provides size info, but it's not
        # completely trustworthy, so we just
        # ask the OS for file details.
        size = os.path.getsize(local_fn)
        mtime = os.path.getmtime(local_fn)

        content = f'[{name}](/user_uploads/{target_path})'

        info = dict(
            message_ids={message_id},
            sender_id=sender_id,
            local_fn=local_fn,
            target_path=target_path,
            name=name,
            size=size,
            mtime=mtime,
            content=content,
        )
        self.info_dict[target_path] = info

        return content

    def write_info(self, output_dir: str, realm_id: int) -> None:
        """Emit attachment records and copy attachment files.

        Writes `uploads/records.json` and `attachment.json` under
        `output_dir` and copies every recorded file into
        `output_dir/uploads/...`.
        """
        attachments: List[Dict[str, Any]] = []
        uploads_records: List[Dict[str, Any]] = []

        def add_attachment(info: Dict[str, Any]) -> None:
            # Append a zerver_attachment row for this file.
            build_attachment(
                realm_id=realm_id,
                message_ids=info['message_ids'],
                user_id=info['sender_id'],
                fileinfo=dict(
                    created=info['mtime'],  # minor lie
                    size=info['size'],
                    name=info['name'],
                ),
                s3_path=info['target_path'],
                zerver_attachment=attachments,
            )

        def add_upload(info: Dict[str, Any]) -> None:
            # Append an uploads/records.json row for this file.
            target_path = info['target_path']
            upload_rec = dict(
                size=info['size'],
                user_profile_id=info['sender_id'],
                realm_id=realm_id,
                s3_path=target_path,
                path=target_path,
                content_type=None,
            )
            uploads_records.append(upload_rec)

        def make_full_target_path(info: Dict[str, Any]) -> str:
            # Compute the copy destination and ensure its parent
            # directories exist.
            target_path = info['target_path']
            full_target_path = os.path.join(
                output_dir,
                'uploads',
                target_path,
            )
            full_target_path = os.path.abspath(full_target_path)
            os.makedirs(os.path.dirname(full_target_path), exist_ok=True)
            return full_target_path

        def copy_file(info: Dict[str, Any]) -> None:
            source_path = info['local_fn']
            target_path = make_full_target_path(info)
            shutil.copyfile(source_path, target_path)

        logging.info('Start processing attachment files')

        for info in self.info_dict.values():
            add_attachment(info)
            add_upload(info)
            copy_file(info)

        uploads_folder = os.path.join(output_dir, 'uploads')
        os.makedirs(os.path.join(uploads_folder, str(realm_id)), exist_ok=True)

        attachment = dict(
            zerver_attachment=attachments,
        )

        create_converted_data_files(uploads_records, output_dir, '/uploads/records.json')
        create_converted_data_files(attachment, output_dir, '/attachment.json')

        logging.info('Done processing attachment files')