2023-03-13 20:25:08 +01:00
import csv
2022-11-07 21:10:28 +01:00
import os
2023-03-13 21:46:56 +01:00
import shutil
2022-11-07 21:10:28 +01:00
from argparse import ArgumentParser
2022-11-07 21:43:18 +01:00
from datetime import datetime , timezone
2022-11-07 21:10:28 +01:00
from email . headerregistry import Address
from functools import lru_cache , reduce
from operator import or_
2023-03-13 21:46:56 +01:00
from typing import Any , Dict , Set , Tuple
2022-11-07 21:10:28 +01:00
2023-03-13 20:25:08 +01:00
import orjson
2022-11-07 21:10:28 +01:00
from django . core . management . base import CommandError
from django . db . models import Q
2023-10-12 19:43:45 +02:00
from typing_extensions import override
2022-11-07 21:10:28 +01:00
from zerver . lib . management import ZulipBaseCommand
2023-03-13 20:22:22 +01:00
from zerver . lib . soft_deactivation import reactivate_user_if_soft_deactivated
2023-03-13 21:46:56 +01:00
from zerver . lib . upload import save_attachment_contents
2024-05-24 18:09:00 +02:00
from zerver . models import AbstractUserMessage , Attachment , Message , Recipient , Stream , UserProfile
2024-02-25 03:47:15 +01:00
from zerver . models . recipients import get_or_create_huddle
2023-12-15 01:16:00 +01:00
from zerver . models . users import get_user_by_delivery_email
2023-03-13 21:46:56 +01:00
def write_attachment(base_path: str, attachment: Attachment) -> None:
    """Write one attachment's contents to ``<base_path>/<attachment.path_id>``.

    Any intermediate directories implied by ``path_id`` are created first.

    Args:
        base_path: Directory under which the attachment is written.
        attachment: The attachment whose ``path_id`` names both the stored
            contents and the relative output location.

    Raises:
        ValueError: If ``path_id`` contains a ``..`` component, which would
            place the output file outside ``base_path``.
    """
    path_id = attachment.path_id
    # Inspect every component of the full path, not just a "../" substring
    # of the directory part: the dirname of "../name" is "..", which such a
    # substring test lets through.  Raising explicitly (rather than using
    # ``assert``) keeps the check in place under ``python -O``.
    if ".." in path_id.split("/"):
        raise ValueError(
            f"Refusing to write attachment {path_id!r} outside of {base_path!r}"
        )

    target_path = f"{base_path}/{path_id}"
    os.makedirs(os.path.dirname(target_path), exist_ok=True)
    with open(target_path, "wb") as attachment_file:
        save_attachment_contents(path_id, attachment_file)
2022-11-07 21:10:28 +01:00
class Command(ZulipBaseCommand):
    """Export a realm's messages that match search terms, dates, or users.

    The output format (JSON or CSV) is chosen from the ``--output`` file
    extension.  With ``--write-attachments``, the attachments referenced by
    the exported messages are written into the given directory as well.
    """

    help = """Exports the messages matching certain search terms, or from
senders/recipients.

This is most often used for legal compliance.
"""

    @staticmethod
    def _parse_utc_datetime(value: str) -> datetime:
        """Parse an ISO datetime string into a timezone-aware UTC datetime.

        A value without a UTC offset is taken to already be in UTC, as the
        ``--after`` / ``--before`` help text states; calling ``astimezone()``
        on a naive datetime would instead interpret it in the system's local
        time zone.  A value with an explicit offset is converted to UTC.

        Raises:
            ValueError: If ``value`` is not a valid ISO format datetime.
        """
        parsed = datetime.fromisoformat(value)
        if parsed.tzinfo is None:
            return parsed.replace(tzinfo=timezone.utc)
        return parsed.astimezone(timezone.utc)

    @override
    def add_arguments(self, parser: ArgumentParser) -> None:
        """Register the command-line options on ``parser``."""
        self.add_realm_args(parser, required=True)
        parser.add_argument(
            "--output",
            metavar="<path>",
            help="File to output JSON/CSV results to; it must not exist, unless --force is given",
            required=True,
        )
        parser.add_argument(
            "--write-attachments",
            metavar="<directory>",
            help="If provided, export all referenced attachments into the directory",
        )
        parser.add_argument(
            "--force", action="store_true", help="Overwrite the output file if it exists already"
        )
        parser.add_argument(
            "--file",
            metavar="<path>",
            help="Read search terms from the named file, one per line",
        )
        parser.add_argument(
            "search_terms",
            nargs="*",
            metavar="<search term>",
            help="Terms to search for in message body or topic",
        )
        parser.add_argument(
            "--after",
            metavar="<datetime>",
            help="Limit to messages on or after this ISO datetime, treated as UTC",
            type=self._parse_utc_datetime,
        )
        parser.add_argument(
            "--before",
            metavar="<datetime>",
            help="Limit to messages on or before this ISO datetime, treated as UTC",
            type=self._parse_utc_datetime,
        )

        # At most one of the user-based limits may be given per invocation.
        users = parser.add_mutually_exclusive_group()
        users.add_argument(
            "--sender",
            action="append",
            metavar="<email>",
            help="Limit to messages sent by users with any of these emails (may be specified more than once)",
        )
        users.add_argument(
            "--recipient",
            action="append",
            metavar="<email>",
            help="Limit to messages received by users with any of these emails (may be specified more than once). This is a superset of --sender, since senders receive every message they send.",
        )
        users.add_argument(
            "--dm",
            action="append",
            metavar="<email>",
            help="Limit to messages in a DM between all of the users provided.",
        )

    @override
    def handle(self, *args: Any, **options: Any) -> None:
        """Run the export described by ``options``.

        Validates the options, builds the message filter, and writes the
        matching messages (ordered by ``date_sent``) to ``options["output"]``.

        Raises:
            CommandError: If no limit was given, the output extension is not
                ``.json`` / ``.csv``, or an output path already exists and
                ``--force`` was not passed.
        """
        terms: Set[str] = set()
        if options["file"]:
            with open(options["file"]) as f:
                # Drop blank lines: an empty term is a substring of every
                # message, so it would silently disable the term filter.
                terms.update(line for line in f.read().splitlines() if line)
        terms.update(options["search_terms"])

        if (
            not terms
            and not options["before"]
            and not options["after"]
            and not options["sender"]
            and not options["recipient"]
            and not options["dm"]
        ):
            raise CommandError("One or more limits are required!")

        if not options["output"].endswith((".json", ".csv")):
            raise CommandError(
                f"Unknown file format: {options['output']} Only .csv and .json are supported"
            )

        if os.path.exists(options["output"]) and not options["force"]:
            raise CommandError(
                f"Output path '{options['output']}' already exists; use --force to overwrite"
            )

        if options["write_attachments"] and os.path.exists(options["write_attachments"]):
            if not options["force"]:
                raise CommandError(
                    f"Attachments output path '{options['write_attachments']}' already exists; use --force to overwrite"
                )
            # --force: start from an empty attachments directory.
            shutil.rmtree(options["write_attachments"])

        realm = self.get_realm(options)
        assert realm is not None

        # A message matches the term limit if any term occurs
        # (case-insensitively) in its content or its topic.
        limits = Q()
        limits = reduce(
            or_,
            [Q(content__icontains=term) | Q(subject__icontains=term) for term in terms],
            limits,
        )

        # Both bounds are inclusive, matching the "on or after" / "on or
        # before" wording of the option help.
        if options["after"]:
            limits &= Q(date_sent__gte=options["after"])
        if options["before"]:
            limits &= Q(date_sent__lte=options["before"])

        if options["recipient"]:
            user_profiles = [get_user_by_delivery_email(e, realm) for e in options["recipient"]]
            for user_profile in user_profiles:
                # Users need to not be long-term idle for the
                # UserMessages to be a judge of which messages they
                # received.
                reactivate_user_if_soft_deactivated(user_profile)
            limits &= Q(
                usermessage__user_profile_id__in=[user_profile.id for user_profile in user_profiles]
            )
        elif options["sender"]:
            limits &= reduce(
                or_,
                [Q(sender__delivery_email__iexact=e) for e in options["sender"]],
            )
        elif options["dm"]:
            user_profiles = [get_user_by_delivery_email(e, realm) for e in options["dm"]]
            for user_profile in user_profiles:
                reactivate_user_if_soft_deactivated(user_profile)
            if len(user_profiles) == 1:
                # One user: every message that user received whose
                # UserMessage row has the is_private flag bit set.
                limits &= Q(
                    usermessage__user_profile_id=user_profiles[0].id,
                    usermessage__flags__andnz=AbstractUserMessage.flags.is_private.mask,
                )
            elif len(user_profiles) == 2:
                # Two users: messages sent by either one to the other's
                # personal recipient.
                user_a, user_b = user_profiles
                limits &= Q(recipient=user_a.recipient, sender=user_b) | Q(
                    recipient=user_b.recipient, sender=user_a
                )
            else:
                # Three or more users: the group conversation among exactly
                # these users.  NOTE(review): judging by its name,
                # get_or_create_huddle may create the huddle if none exists.
                huddle = get_or_create_huddle([user.id for user in user_profiles])
                limits &= Q(recipient=huddle.recipient)

        attachments_written: Set[str] = set()
        messages_query = Message.objects.filter(limits, realm=realm).order_by("date_sent")
        if options["recipient"] and len(options["recipient"]) > 1:
            # Filtering across the usermessage relation with several user ids
            # yields a message once per matching recipient; collapse those
            # duplicates so each message is exported a single time.
            messages_query = messages_query.distinct()
        print(f"Exporting {len(messages_query)} messages...")

        @lru_cache(maxsize=1000)
        def format_sender(full_name: str, delivery_email: str) -> str:
            """Render a user as an email-style ``Name <address>`` string."""
            return str(Address(display_name=full_name, addr_spec=delivery_email))

        def format_full_recipient(recipient_id: int, subject: str) -> str:
            """Render the recipient, appending the topic for streams."""
            recip_str, has_subject = format_recipient(recipient_id)
            if not has_subject:
                return recip_str
            return f"{recip_str} > {subject}"

        @lru_cache(maxsize=1000)
        def format_recipient(recipient_id: int) -> Tuple[str, bool]:
            """Return ``(display string, whether a topic applies)``.

            Streams render as ``#name`` with a topic; any other recipient
            renders as the comma-separated list of its subscribed users,
            ordered by full name, without a topic.
            """
            recipient = Recipient.objects.get(id=recipient_id)

            if recipient.type == Recipient.STREAM:
                stream = Stream.objects.values("name").get(id=recipient.type_id)
                return "#" + stream["name"], True

            users = (
                UserProfile.objects.filter(
                    subscription__recipient_id=recipient.id,
                )
                .order_by("full_name")
                .values_list("full_name", "delivery_email")
            )

            return ", ".join(format_sender(e[0], e[1]) for e in users), False

        def transform_message(message: Message) -> Dict[str, str]:
            """Convert a message into one output row.

            When ``--write-attachments`` is set, this also writes each
            referenced attachment to disk, once per distinct ``path_id``.
            """
            row = {
                "id": str(message.id),
                "timestamp (UTC)": message.date_sent.astimezone(timezone.utc).strftime(
                    "%Y-%m-%d %H:%M:%S"
                ),
                "sender": format_sender(message.sender.full_name, message.sender.delivery_email),
                "recipient": format_full_recipient(message.recipient_id, message.subject),
                "content": message.content,
                "edit history": message.edit_history if message.edit_history is not None else "",
            }
            if options["write_attachments"]:
                if message.has_attachment:
                    attachments = message.attachment_set.all()
                    row["attachments"] = " ".join(a.path_id for a in attachments)
                    for attachment in attachments:
                        # The same upload can be referenced by several
                        # messages; only write it the first time.
                        if attachment.path_id in attachments_written:
                            continue
                        write_attachment(options["write_attachments"], attachment)
                        attachments_written.add(attachment.path_id)
                else:
                    row["attachments"] = ""
            return row

        if options["output"].endswith(".json"):
            with open(options["output"], "wb") as json_file:
                json_file.write(
                    # orjson doesn't support dumping from a generator
                    orjson.dumps(
                        [transform_message(m) for m in messages_query], option=orjson.OPT_INDENT_2
                    )
                )
        elif options["output"].endswith(".csv"):
            with open(options["output"], "w") as csv_file:
                columns = [
                    "id",
                    "timestamp (UTC)",
                    "sender",
                    "recipient",
                    "content",
                    "edit history",
                ]
                # The attachments column exists only when attachments are
                # being exported, mirroring the keys of transform_message().
                if options["write_attachments"]:
                    columns += ["attachments"]
                csvwriter = csv.DictWriter(csv_file, columns)
                csvwriter.writeheader()
                csvwriter.writerows(transform_message(m) for m in messages_query)