2022-11-07 21:10:28 +01:00
import os
from argparse import ArgumentParser
2022-11-07 21:43:18 +01:00
from datetime import datetime , timezone
2022-11-07 21:10:28 +01:00
from email . headerregistry import Address
from functools import lru_cache , reduce
from operator import or_
from typing import Any
from django . core . management . base import CommandError
from django . db . models import Q
from django . forms . models import model_to_dict
from zerver . lib . export import floatify_datetime_fields , write_table_data
from zerver . lib . management import ZulipBaseCommand
2023-03-13 20:22:22 +01:00
from zerver . lib . soft_deactivation import reactivate_user_if_soft_deactivated
from zerver . models import Message , Recipient , Stream , UserProfile , get_user_by_delivery_email
2022-11-07 21:10:28 +01:00
# Message fields dropped from every exported row.  Each entry must match a
# key of the dict that model_to_dict() builds for a Message exactly (no
# surrounding whitespace), because the entries are removed with
# `del item[key]`, which raises KeyError on a name that is not present.
ignore_keys = [
    "realm",
    "rendered_content_version",
    "sending_client",
    "search_tsvector",
]
class Command(ZulipBaseCommand):
    """Management command exporting the messages that match a set of limits.

    Limits can be search terms (matched against message content and topic),
    a date range, a set of senders, or a set of recipients.  The matching
    messages are written to a JSON file via write_table_data().
    """

    help = """Exports the messages matching certain search terms, or from
senders/recipients.

This is most often used for legal compliance.
"""

    @staticmethod
    def _parse_utc_datetime(value: str) -> datetime:
        """Parse an ISO-format datetime string into an aware UTC datetime.

        A value without an explicit UTC offset is taken to already be in
        UTC, as the --after/--before help text states.  Calling
        astimezone() on a naive datetime would instead interpret it in the
        server's local timezone, so the UTC tzinfo is attached directly.

        Raises ValueError on malformed input; argparse reports a
        ValueError raised by a `type=` callable as a usage error.
        """
        parsed = datetime.fromisoformat(value)
        if parsed.tzinfo is None:
            return parsed.replace(tzinfo=timezone.utc)
        return parsed.astimezone(timezone.utc)

    def add_arguments(self, parser: ArgumentParser) -> None:
        """Register the realm, output, search-term, date and user options."""
        self.add_realm_args(parser, required=True)
        parser.add_argument(
            "--output",
            metavar="<path>",
            help="File to output JSON results to; it must not exist, unless --force is given",
            required=True,
        )
        parser.add_argument(
            "--force", action="store_true", help="Overwrite the output file if it exists already"
        )
        parser.add_argument(
            "--file",
            metavar="<path>",
            help="Read search terms from the named file, one per line",
        )
        parser.add_argument(
            "search_terms",
            nargs="*",
            metavar="<search term>",
            help="Terms to search for in message body or topic",
        )
        parser.add_argument(
            "--after",
            metavar="<datetime>",
            help="Limit to messages on or after this ISO datetime, treated as UTC",
            type=self._parse_utc_datetime,
        )
        parser.add_argument(
            "--before",
            metavar="<datetime>",
            help="Limit to messages on or before this ISO datetime, treated as UTC",
            type=self._parse_utc_datetime,
        )
        # --sender and --recipient cannot be combined with each other.
        users = parser.add_mutually_exclusive_group()
        users.add_argument(
            "--sender",
            action="append",
            metavar="<email>",
            help="Limit to messages sent by users with any of these emails (may be specified more than once)",
        )
        users.add_argument(
            "--recipient",
            action="append",
            metavar="<email>",
            help="Limit to messages received by users with any of these emails (may be specified more than once). This is a superset of --sender, since senders receive every message they send.",
        )

    def handle(self, *args: Any, **options: Any) -> None:
        """Collect the limits, query the matching messages and write them out.

        Raises CommandError when no limit at all was given, or when the
        output path already exists and --force was not passed.
        """
        terms = set()
        if options["file"]:
            with open(options["file"], encoding="utf-8") as f:
                # Skip blank lines: an empty search term would match every
                # message and silently defeat all the other terms.
                terms.update(line for line in f.read().splitlines() if line.strip())
        terms.update(options["search_terms"])

        if (
            not terms
            and not options["before"]
            and not options["after"]
            and not options["sender"]
            and not options["recipient"]
        ):
            raise CommandError("One or more limits are required!")

        if os.path.exists(options["output"]) and not options["force"]:
            raise CommandError(
                f"Output path '{options['output']}' already exists; use --force to overwrite"
            )

        realm = self.get_realm(options)
        assert realm is not None

        # Search terms are OR-ed together; every other limit is AND-ed on.
        limits = Q()
        limits = reduce(
            or_,
            [Q(content__icontains=term) | Q(subject__icontains=term) for term in terms],
            limits,
        )

        # Both bounds are inclusive, matching the "on or after" /
        # "on or before" wording of the option help.
        if options["after"]:
            limits &= Q(date_sent__gte=options["after"])
        if options["before"]:
            limits &= Q(date_sent__lte=options["before"])

        need_distinct = False
        if options["recipient"]:
            user_profiles = [get_user_by_delivery_email(e, realm) for e in options["recipient"]]
            for user_profile in user_profiles:
                # Users need to not be long-term idle for the
                # UserMessages to be a judge of which messages they
                # received.
                reactivate_user_if_soft_deactivated(user_profile)
            limits &= Q(
                usermessage__user_profile_id__in=[user_profile.id for user_profile in user_profiles]
            )
            # The join through usermessage yields one row per matching
            # UserMessage, so a message received by several of the listed
            # users would otherwise be exported several times.
            need_distinct = len(user_profiles) > 1
        elif options["sender"]:
            limits &= reduce(
                or_,
                [Q(sender__delivery_email__iexact=e) for e in options["sender"]],
            )

        messages_query = Message.objects.filter(limits, realm=realm).order_by("date_sent")
        if need_distinct:
            messages_query = messages_query.distinct()
        # len() evaluates the queryset and caches its rows, which the loop
        # below then reuses.
        print(f"Exporting {len(messages_query)} messages...")

        def format_sender(full_name: str, delivery_email: str) -> str:
            """Render a user as an email-style address string."""
            return str(Address(display_name=full_name, addr_spec=delivery_email))

        @lru_cache(maxsize=None)
        def format_recipient(recipient_id: int) -> str:
            """Render a recipient as "#stream name" or a list of its users.

            Cached so that each distinct recipient is looked up only once
            per export.
            """
            recipient = Recipient.objects.get(id=recipient_id)

            if recipient.type == Recipient.STREAM:
                stream = Stream.objects.values("name").get(id=recipient.type_id)
                return "#" + stream["name"]

            users = (
                UserProfile.objects.filter(
                    subscription__recipient_id=recipient.id,
                )
                .order_by("full_name")
                .values_list("full_name", "delivery_email")
            )

            return ", ".join([format_sender(e[0], e[1]) for e in users])

        message_dicts = []
        for message in messages_query:
            item = model_to_dict(message)
            item["recipient_name"] = format_recipient(message.recipient_id)
            item["sender_name"] = format_sender(
                message.sender.full_name, message.sender.delivery_email
            )
            for key in ignore_keys:
                del item[key]

            message_dicts.append(item)

        output = {"zerver_message": message_dicts}
        floatify_datetime_fields(output, "zerver_message")
        for item in output["zerver_message"]:
            # Add a human-readable UTC rendering of date_sent, built from
            # the value left there by floatify_datetime_fields().
            item["date_sent_utc"] = datetime.fromtimestamp(
                int(item["date_sent"]), timezone.utc
            ).strftime("%Y-%m-%d %H:%M:%S")

        write_table_data(options["output"], output)