diff --git a/stubs/taint/false_positives.pysa b/stubs/taint/false_positives.pysa new file mode 100644 index 0000000000..d4e1ef8cfd --- /dev/null +++ b/stubs/taint/false_positives.pysa @@ -0,0 +1,59 @@ +# This function ensures that a redirect is only within the specified domain. +# Assuming that the domain isn't attacker controllable, the result is safe to +# redirect to +def zerver.views.auth.get_safe_redirect_to(url, redirect_host) -> Sanitize: ... + +# This function was previously the source of an open redirect, but has now been +# reviewed and patched, so the output should now be safe to redirect to, +# regardless of the value of the specified 'path'. +def zerver.lib.thumbnail.generate_thumbnail_url( + path, + size=..., + is_camo_url=... +) -> Sanitize: ... + +# This function returns a version of name that only contains word and space +# characters, or ., -, _ characters. This should be safe to put into URLs and +# filesystem operations. +def zerver.lib.upload.sanitize_name(value) -> Sanitize: ... + +# This function accepts two integers and then concatenates them into a path +# segment. The result should be safe for use in filesystem and other operations. +def zerver.lib.avatar_hash.user_avatar_path_from_ids(user_profile_id, realm_id) -> Sanitize: ... + +# This function creates a list of 'UserMessageLite' objects, which contain only +# integral IDs and flags. These should safe for use with SQL and other +# operations. +def zerver.lib.actions.create_user_messages( + message, + um_eligible_user_ids, + long_term_idle_user_ids, + stream_push_user_ids, + stream_email_user_ids, + mentioned_user_ids, + mark_as_read +) -> Sanitize: ... + +# This function is an identity function used for removing taint from variables +# when there is no convenient way to do it by annotating existing functions. +def zerver.lib.pysa.mark_sanitized(arg) -> Sanitize: ... + +############################ +# Overbroad Approximations # +############################ + +# Note that the below functions are overbroad approximations of Sanitizers and +# could lead to false negatives. They should be replaced with more specific +# feature-based filtering when that is available through SAPP. + +# This function generates a URL pointing to a valid Django endpoint, with +# arguments properly URL encoded. The resulting URL can usually be used as a +# part of a redirect or HTTP request without fear of open redirect or SSRF +# vulnerabilities respectively. +def django.urls.base.reverse( + viewname, + urlconf=..., + args=..., + kwargs=..., + current_app=... +) -> Sanitize: ... diff --git a/stubs/taint/req_lib.pysa b/stubs/taint/req_lib.pysa new file mode 100644 index 0000000000..327e83ff9e --- /dev/null +++ b/stubs/taint/req_lib.pysa @@ -0,0 +1,4 @@ +# One of the ways user-controlled data enters the application is through the +# request variables framework. This model teaches Pysa that every instance of +# 'REQ()' in a view function is a source of UserControlled taint. +class zerver.lib.request._REQ(TaintSource[UserControlled]): ... diff --git a/zerver/lib/actions.py b/zerver/lib/actions.py index 0dc4d5d9f6..b17d5b40a6 100644 --- a/zerver/lib/actions.py +++ b/zerver/lib/actions.py @@ -50,6 +50,7 @@ from zerver.lib.message import ( truncate_body, truncate_topic, ) +from zerver.lib.pysa import mark_sanitized from zerver.lib.realm_icon import realm_icon_url from zerver.lib.realm_logo import get_realm_logo_data from zerver.lib.retention import move_messages_to_archive @@ -5146,6 +5147,11 @@ def check_add_realm_emoji(realm: Realm, realm_emoji.save() emoji_file_name = get_emoji_file_name(image_file.name, realm_emoji.id) + + # The only user-controlled portion of 'emoji_file_name' is an extension, + # which can not contain '..' or '/' or '\', making it difficult to exploit + emoji_file_name = mark_sanitized(emoji_file_name) + emoji_uploaded_successfully = False try: upload_emoji_image(image_file, emoji_file_name, author) diff --git a/zerver/lib/export.py b/zerver/lib/export.py index 163976bbe4..722e964631 100644 --- a/zerver/lib/export.py +++ b/zerver/lib/export.py @@ -31,6 +31,8 @@ from zerver.models import UserProfile, Realm, Client, Huddle, Stream, \ CustomProfileFieldValue, get_display_recipient, Attachment, get_system_bot, \ RealmAuditLog, UserHotspot, MutedTopic, Service, UserGroup, \ UserGroupMembership, BotStorageData, BotConfigData +from zerver.lib.pysa import mark_sanitized + import zerver.lib.upload from typing import Any, Callable, Dict, List, Optional, Set, Tuple, \ Union @@ -1239,7 +1241,11 @@ def _save_s3_object_to_file(key: ServiceResource, output_dir: str, processing_av if "../" in filename: raise AssertionError(f"Suspicious file with invalid format {filename}") - dirname = os.path.dirname(filename) + # Use 'mark_sanitized' to cause Pysa to ignore the flow of user controlled + # data into the filesystem sink, because we've already prevented directory + # traversal with our assertion above. + dirname = mark_sanitized(os.path.dirname(filename)) + if not os.path.exists(dirname): os.makedirs(dirname) key.download_file(filename) @@ -1303,16 +1309,22 @@ def export_uploads_from_local(realm: Realm, local_dir: Path, output_dir: Path) - count = 0 records = [] for attachment in Attachment.objects.filter(realm_id=realm.id): - local_path = os.path.join(local_dir, attachment.path_id) - output_path = os.path.join(output_dir, attachment.path_id) + # Use 'mark_sanitized' to work around false positive caused by Pysa + # thinking that 'realm' (and thus 'attachment' and 'attachment.path_id') + # are user controlled + path_id = mark_sanitized(attachment.path_id) + + local_path = os.path.join(local_dir, path_id) + output_path = os.path.join(output_dir, path_id) + os.makedirs(os.path.dirname(output_path), exist_ok=True) shutil.copy2(local_path, output_path) stat = os.stat(local_path) record = dict(realm_id=attachment.realm_id, user_profile_id=attachment.owner.id, user_profile_email=attachment.owner.email, - s3_path=attachment.path_id, - path=attachment.path_id, + s3_path=path_id, + path=path_id, size=stat.st_size, last_modified=stat.st_mtime, content_type=None) @@ -1398,8 +1410,15 @@ def export_emoji_from_local(realm: Realm, local_dir: Path, output_dir: Path) -> realm_id=realm.id, emoji_file_name=realm_emoji.file_name ) + + # Use 'mark_sanitized' to work around false positive caused by Pysa + # thinking that 'realm' (and thus 'attachment' and 'attachment.path_id') + # are user controlled + emoji_path = mark_sanitized(emoji_path) + local_path = os.path.join(local_dir, emoji_path) output_path = os.path.join(output_dir, emoji_path) + os.makedirs(os.path.dirname(output_path), exist_ok=True) shutil.copy2(local_path, output_path) # Realm Emoji author is optional. diff --git a/zerver/lib/pysa.py b/zerver/lib/pysa.py new file mode 100644 index 0000000000..6019b29378 --- /dev/null +++ b/zerver/lib/pysa.py @@ -0,0 +1,8 @@ +from typing import TypeVar + + +T = TypeVar("T") + + +def mark_sanitized(arg: T) -> T: + return arg diff --git a/zerver/lib/unminify.py b/zerver/lib/unminify.py index 28b0b6b680..d6b41b720f 100644 --- a/zerver/lib/unminify.py +++ b/zerver/lib/unminify.py @@ -4,6 +4,8 @@ import sourcemap from typing import Dict, List +from zerver.lib.pysa import mark_sanitized + class SourceMap: '''Map (line, column) pairs from generated to source file.''' @@ -15,11 +17,25 @@ class SourceMap: def _index_for(self, minified_src: str) -> sourcemap.SourceMapDecoder: '''Return the source map index for minified_src, loading it if not already loaded.''' + + # Prevent path traversal + assert ".." not in minified_src and "/" not in minified_src + if minified_src not in self._indices: for source_dir in self._dirs: filename = os.path.join(source_dir, minified_src + '.map') if os.path.isfile(filename): - with open(filename) as fp: + # Use 'mark_sanitized' to force Pysa to ignore the fact that + # 'filename' is user controlled. While putting user + # controlled data into a filesystem operation is bad, in + # this case it's benign because 'filename' can't traverse + # directories outside of the pre-configured 'sourcemap_dirs' + # (due to the above assertions) and will always end in + # '.map'. Additionally, the result of this function is used + # for error logging and not returned to the user, so + # controlling the loaded file would not be useful to an + # attacker. + with open(mark_sanitized(filename)) as fp: self._indices[minified_src] = sourcemap.load(fp) break diff --git a/zerver/lib/upload.py b/zerver/lib/upload.py index 030db29994..64b7988824 100644 --- a/zerver/lib/upload.py +++ b/zerver/lib/upload.py @@ -98,7 +98,9 @@ def sanitize_name(value: str) -> str: """ value = unicodedata.normalize('NFKC', value) value = re.sub(r'[^\w\s._-]', '', value, flags=re.U).strip() - return mark_safe(re.sub(r'[-\s]+', '-', value, flags=re.U)) + value = re.sub(r'[-\s]+', '-', value, flags=re.U) + assert value not in {'', '.', '..'} + return mark_safe(value) def random_name(bytes: int=60) -> str: return base64.urlsafe_b64encode(os.urandom(bytes)).decode('utf-8') @@ -653,6 +655,7 @@ class S3UploadBackend(ZulipUploadBackend): def write_local_file(type: str, path: str, file_data: bytes) -> None: file_path = os.path.join(settings.LOCAL_UPLOADS_DIR, type, path) + os.makedirs(os.path.dirname(file_path), exist_ok=True) with open(file_path, 'wb') as f: f.write(file_data) diff --git a/zerver/lib/url_encoding.py b/zerver/lib/url_encoding.py index a5d59d031e..3bd12dd276 100644 --- a/zerver/lib/url_encoding.py +++ b/zerver/lib/url_encoding.py @@ -1,6 +1,7 @@ import urllib from typing import Any, Dict, List +from zerver.lib.pysa import mark_sanitized from zerver.lib.topic import get_topic_from_message_info from zerver.models import Realm, Stream, UserProfile @@ -97,8 +98,12 @@ def near_pm_message_url(realm: Realm, return full_url def add_query_to_redirect_url(original_url: str, query: str) -> str: - return original_url + "?" + query + # Using 'mark_sanitized' because user-controlled data after the '?' is + # not relevant for open redirects + return original_url + "?" + mark_sanitized(query) def add_query_arg_to_redirect_url(original_url: str, query_arg: str) -> str: assert '?' in original_url - return original_url + "&" + query_arg + # Using 'mark_sanitized' because user-controlled data after the '?' is + # not relevant for open redirects + return original_url + "&" + mark_sanitized(query_arg) diff --git a/zerver/lib/url_preview/preview.py b/zerver/lib/url_preview/preview.py index 25bccdf000..f69d574043 100644 --- a/zerver/lib/url_preview/preview.py +++ b/zerver/lib/url_preview/preview.py @@ -9,6 +9,7 @@ from typing.re import Match from version import ZULIP_VERSION from zerver.lib.cache import cache_with_key, get_cache_with_key, preview_url_cache_key +from zerver.lib.pysa import mark_sanitized from zerver.lib.url_preview.oembed import get_oembed_data from zerver.lib.url_preview.parsers import OpenGraphParser, GenericParser @@ -88,7 +89,7 @@ def get_link_embed_data(url: str, if data.get('oembed'): return data - response = requests.get(url, stream=True, headers=HEADERS, timeout=TIMEOUT) + response = requests.get(mark_sanitized(url), stream=True, headers=HEADERS, timeout=TIMEOUT) if response.ok: og_data = OpenGraphParser(response.text).extract_data() for key in ['title', 'description', 'image']: diff --git a/zerver/models.py b/zerver/models.py index 5d65b10baa..228f6a1dcd 100644 --- a/zerver/models.py +++ b/zerver/models.py @@ -28,6 +28,7 @@ from zerver.lib.timestamp import datetime_to_timestamp from django.db.models.signals import post_save, post_delete from django.utils.translation import ugettext_lazy as _ from zerver.lib import cache +from zerver.lib.pysa import mark_sanitized from zerver.lib.validator import check_int, \ check_short_string, check_long_string, validate_choice_field, check_date, \ check_url, check_list @@ -518,7 +519,9 @@ class Realm(models.Model): @property def host(self) -> str: - return self.host_for_subdomain(self.subdomain) + # Use mark sanitized to prevent false positives from Pysa thinking that + # the host is user controlled. + return mark_sanitized(self.host_for_subdomain(self.subdomain)) @staticmethod def host_for_subdomain(subdomain: str) -> str: diff --git a/zerver/views/auth.py b/zerver/views/auth.py index 2b07424737..45e717d525 100644 --- a/zerver/views/auth.py +++ b/zerver/views/auth.py @@ -30,6 +30,7 @@ from zerver.forms import HomepageForm, OurAuthenticationForm, \ AuthenticationTokenForm from zerver.lib.mobile_auth_otp import otp_encrypt_api_key from zerver.lib.push_notifications import push_notifications_enabled +from zerver.lib.pysa import mark_sanitized from zerver.lib.realm_icon import realm_icon_url from zerver.lib.request import REQ, has_request_variables, JsonableError from zerver.lib.response import json_success, json_error @@ -64,7 +65,10 @@ ExtraContext = Optional[Dict[str, Any]] def get_safe_redirect_to(url: str, redirect_host: str) -> str: is_url_safe = is_safe_url(url=url, allowed_hosts=None) if is_url_safe: - return urllib.parse.urljoin(redirect_host, url) + # Mark as safe to prevent Pysa from surfacing false positives for + # open redirects. In this branch, we have already checked that the URL + # points to the specified 'redirect_host', or is relative. + return urllib.parse.urljoin(redirect_host, mark_sanitized(url)) else: return redirect_host @@ -186,7 +190,10 @@ def maybe_send_to_registration(request: HttpRequest, email: str, full_name: str= host = realm.host else: host = request.get_host() - confirmation_link = create_confirmation_link(prereg_user, host, + # Mark 'host' as safe for use in a redirect. It's pulled from the + # current request or realm, both of which only allow a limited set of + # trusted hosts. + confirmation_link = create_confirmation_link(prereg_user, mark_sanitized(host), Confirmation.USER_REGISTRATION) if is_signup: return redirect(confirmation_link) diff --git a/zerver/views/registration.py b/zerver/views/registration.py index 721811aec6..d9d6f460da 100644 --- a/zerver/views/registration.py +++ b/zerver/views/registration.py @@ -27,6 +27,7 @@ from zerver.decorator import require_post, \ from zerver.lib.create_user import get_role_for_new_user from zerver.lib.onboarding import send_initial_realm_messages, setup_realm_internal_bots from zerver.lib.sessions import get_expirable_session_var +from zerver.lib.pysa import mark_sanitized from zerver.lib.subdomains import get_subdomain, is_root_domain_available from zerver.lib.timezone import get_all_timezones from zerver.lib.url_encoding import add_query_to_redirect_url @@ -408,7 +409,9 @@ def login_and_go_to_home(request: HttpRequest, user_profile: UserProfile) -> Htt return finish_desktop_flow(request, user_profile, desktop_flow_otp) do_login(request, user_profile) - return HttpResponseRedirect(user_profile.realm.uri + reverse('zerver.views.home.home')) + # Using 'mark_sanitized' to work around false positive where Pysa thinks + # that 'user_profile' is user-controlled + return HttpResponseRedirect(mark_sanitized(user_profile.realm.uri) + reverse('zerver.views.home.home')) def prepare_activation_url(email: str, request: HttpRequest, realm_creation: bool=False, diff --git a/zerver/views/video_calls.py b/zerver/views/video_calls.py index 567dd7c63f..f9574a8194 100644 --- a/zerver/views/video_calls.py +++ b/zerver/views/video_calls.py @@ -19,6 +19,7 @@ from requests_oauthlib import OAuth2Session from zerver.decorator import REQ, has_request_variables, zulip_login_required from zerver.lib.actions import do_set_zoom_token from zerver.lib.exceptions import ErrorCode, JsonableError +from zerver.lib.pysa import mark_sanitized from zerver.lib.response import json_success from zerver.lib.subdomains import get_subdomain from zerver.lib.validator import check_dict, check_string @@ -57,7 +58,10 @@ def get_zoom_sid(request: HttpRequest) -> str: # token directly to the Zoom server. csrf.get_token(request) - return ( + # Use 'mark_sanitized' to cause Pysa to ignore the flow of user controlled + # data out of this function. 'request.META' is indeed user controlled, but + # post-HMAC ouptut is no longer meaningfully controllable. + return mark_sanitized( "" if getattr(request, "_dont_enforce_csrf_checks", False) else salted_hmac("Zulip Zoom sid", request.META["CSRF_COOKIE"]).hexdigest() diff --git a/zerver/views/zephyr.py b/zerver/views/zephyr.py index 0b1d345a55..99db0796ae 100644 --- a/zerver/views/zephyr.py +++ b/zerver/views/zephyr.py @@ -3,6 +3,7 @@ from django.http import HttpResponse, HttpRequest from django.utils.translation import ugettext as _ from zerver.decorator import authenticated_json_view from zerver.lib.ccache import make_ccache +from zerver.lib.pysa import mark_sanitized from zerver.lib.request import has_request_variables, REQ from zerver.lib.response import json_success, json_error from zerver.lib.users import get_api_key @@ -43,6 +44,14 @@ def webathena_kerberos_login(request: HttpRequest, user_profile: UserProfile, # This is important for security since DNS is not secure. assert(re.match(r'^[a-z0-9_.-]+$', user) is not None) ccache = make_ccache(parsed_cred) + + # 'user' has been verified to contain only benign characters that won't + # help with shell injection. + user = mark_sanitized(user) + + # 'ccache' is only written to disk by the script and used as a kerberos + # credential cache file. + ccache = mark_sanitized(ccache) except Exception: return json_error(_("Invalid Kerberos cache")) diff --git a/zerver/worker/queue_processors.py b/zerver/worker/queue_processors.py index 1e76dbf760..0d87f6171c 100644 --- a/zerver/worker/queue_processors.py +++ b/zerver/worker/queue_processors.py @@ -45,6 +45,7 @@ from zerver.lib.bot_lib import EmbeddedBotHandler, get_bot_handler, EmbeddedBotQ from zerver.lib.exceptions import RateLimited from zerver.lib.export import export_realm_wrapper from zerver.lib.remote_server import PushNotificationBouncerRetryLaterError +from zerver.lib.pysa import mark_sanitized import os import ujson @@ -204,7 +205,9 @@ class QueueProcessingWorker(ABC): self._log_problem() if not os.path.exists(settings.QUEUE_ERROR_DIR): os.mkdir(settings.QUEUE_ERROR_DIR) # nocoverage - fname = f'{self.queue_name}.errors' + # Use 'mark_sanitized' to prevent Pysa from detecting this false positive + # flow. 'queue_name' is always a constant string. + fname = mark_sanitized(f'{self.queue_name}.errors') fn = os.path.join(settings.QUEUE_ERROR_DIR, fname) line = f'{time.asctime()}\t{ujson.dumps(events)}\n' lock_fn = fn + '.lock'