pysa: Introduce sanitizers, models, and inline marking safe.

This commit adds three `.pysa` model files: `false_positives.pysa` for ruling out false positive flows with `Sanitize` annotations, `req_lib.pysa` for educating pysa about Zulip's `REQ()` pattern for extracting user input, and `redirects.pysa` for capturing the risk of open redirects within Zulip code. Additionally, this commit introduces `mark_sanitized`, an identity function which can be used to selectively clear taint in cases where `Sanitize` models will not work. This commit also puts `mark_sanitized` to work removing known false postive flows.
2019-12-19 18:00:45 -05:00 · 2019-12-19 18:00:45 -05:00 · 461d5b1a3e
parent 89131bfcbb
commit 461d5b1a3e
15 changed files with 166 additions and 16 deletions
--- a/stubs/taint/false_positives.pysa
+++ b/stubs/taint/false_positives.pysa
@ -0,0 +1,59 @@
+# This function ensures that a redirect is only within the specified domain.
+# Assuming that the domain isn't attacker controllable, the result is safe to
+# redirect to
+def zerver.views.auth.get_safe_redirect_to(url, redirect_host) -> Sanitize: ...
+
+# This function was previously the source of an open redirect, but has now been
+# reviewed and patched, so the output should now be safe to redirect to,
+# regardless of the value of the specified 'path'.
+def zerver.lib.thumbnail.generate_thumbnail_url(
+    path,
+    size=...,
+    is_camo_url=...
+) -> Sanitize: ...
+
+# This function returns a version of name that only contains word and space
+# characters, or ., -, _ characters. This should be safe to put into URLs and
+# filesystem operations.
+def zerver.lib.upload.sanitize_name(value) -> Sanitize: ...
+
+# This function accepts two integers and then concatenates them into a path
+# segment. The result should be safe for use in filesystem and other operations.
+def zerver.lib.avatar_hash.user_avatar_path_from_ids(user_profile_id, realm_id) -> Sanitize: ...
+
+# This function creates a list of 'UserMessageLite' objects, which contain only
+# integral IDs and flags. These should safe for use with SQL and other
+# operations.
+def zerver.lib.actions.create_user_messages(
+    message,
+    um_eligible_user_ids,
+    long_term_idle_user_ids,
+    stream_push_user_ids,
+    stream_email_user_ids,
+    mentioned_user_ids,
+    mark_as_read
+) -> Sanitize: ...
+
+# This function is an identity function used for removing taint from variables
+# when there is no convenient way to do it by annotating existing functions.
+def zerver.lib.pysa.mark_sanitized(arg) -> Sanitize: ...
+
+############################
+# Overbroad Approximations #
+############################
+
+# Note that the below functions are overbroad approximations of Sanitizers and
+# could lead to false negatives. They should be replaced with more specific
+# feature-based filtering when that is available through SAPP.
+
+# This function generates a URL pointing to a valid Django endpoint, with
+# arguments properly URL encoded. The resulting URL can usually be used as a
+# part of a redirect or HTTP request without fear of open redirect or SSRF
+# vulnerabilities respectively.
+def django.urls.base.reverse(
+    viewname,
+    urlconf=...,
+    args=...,
+    kwargs=...,
+    current_app=...
+) -> Sanitize: ...
--- a/stubs/taint/req_lib.pysa
+++ b/stubs/taint/req_lib.pysa
@ -0,0 +1,4 @@
+# One of the ways user-controlled data enters the application is through the
+# request variables framework. This model teaches Pysa that every instance of
+# 'REQ()' in a view function is a source of UserControlled taint.
+class zerver.lib.request._REQ(TaintSource[UserControlled]): ...
--- a/zerver/lib/actions.py
+++ b/zerver/lib/actions.py
@ -50,6 +50,7 @@ from zerver.lib.message import (
    truncate_body,
    truncate_topic,
 )
+from zerver.lib.pysa import mark_sanitized
 from zerver.lib.realm_icon import realm_icon_url
 from zerver.lib.realm_logo import get_realm_logo_data
 from zerver.lib.retention import move_messages_to_archive
@ -5146,6 +5147,11 @@ def check_add_realm_emoji(realm: Realm,
    realm_emoji.save()

    emoji_file_name = get_emoji_file_name(image_file.name, realm_emoji.id)
+
+    # The only user-controlled portion of 'emoji_file_name' is an extension,
+    # which can not contain '..' or '/' or '\', making it difficult to exploit
+    emoji_file_name = mark_sanitized(emoji_file_name)
+
    emoji_uploaded_successfully = False
    try:
        upload_emoji_image(image_file, emoji_file_name, author)
--- a/zerver/lib/export.py
+++ b/zerver/lib/export.py
@ -31,6 +31,8 @@ from zerver.models import UserProfile, Realm, Client, Huddle, Stream, \
    CustomProfileFieldValue, get_display_recipient, Attachment, get_system_bot, \
    RealmAuditLog, UserHotspot, MutedTopic, Service, UserGroup, \
    UserGroupMembership, BotStorageData, BotConfigData
+from zerver.lib.pysa import mark_sanitized
+
 import zerver.lib.upload
 from typing import Any, Callable, Dict, List, Optional, Set, Tuple, \
    Union
@ -1239,7 +1241,11 @@ def _save_s3_object_to_file(key: ServiceResource, output_dir: str, processing_av
    if "../" in filename:
        raise AssertionError(f"Suspicious file with invalid format {filename}")

-    dirname = os.path.dirname(filename)
+    # Use 'mark_sanitized' to cause Pysa to ignore the flow of user controlled
+    # data into the filesystem sink, because we've already prevented directory
+    # traversal with our assertion above.
+    dirname = mark_sanitized(os.path.dirname(filename))
+
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    key.download_file(filename)
@ -1303,16 +1309,22 @@ def export_uploads_from_local(realm: Realm, local_dir: Path, output_dir: Path) -
    count = 0
    records = []
    for attachment in Attachment.objects.filter(realm_id=realm.id):
-        local_path = os.path.join(local_dir, attachment.path_id)
-        output_path = os.path.join(output_dir, attachment.path_id)
+        # Use 'mark_sanitized' to work around false positive caused by Pysa
+        # thinking that 'realm' (and thus 'attachment' and 'attachment.path_id')
+        # are user controlled
+        path_id = mark_sanitized(attachment.path_id)
+
+        local_path = os.path.join(local_dir, path_id)
+        output_path = os.path.join(output_dir, path_id)
+
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        shutil.copy2(local_path, output_path)
        stat = os.stat(local_path)
        record = dict(realm_id=attachment.realm_id,
                      user_profile_id=attachment.owner.id,
                      user_profile_email=attachment.owner.email,
-                      s3_path=attachment.path_id,
-                      path=attachment.path_id,
+                      s3_path=path_id,
+                      path=path_id,
                      size=stat.st_size,
                      last_modified=stat.st_mtime,
                      content_type=None)
@ -1398,8 +1410,15 @@ def export_emoji_from_local(realm: Realm, local_dir: Path, output_dir: Path) ->
            realm_id=realm.id,
            emoji_file_name=realm_emoji.file_name
        )
+
+        # Use 'mark_sanitized' to work around false positive caused by Pysa
+        # thinking that 'realm' (and thus 'attachment' and 'attachment.path_id')
+        # are user controlled
+        emoji_path = mark_sanitized(emoji_path)
+
        local_path = os.path.join(local_dir, emoji_path)
        output_path = os.path.join(output_dir, emoji_path)
+
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        shutil.copy2(local_path, output_path)
        # Realm Emoji author is optional.
--- a/zerver/lib/pysa.py
+++ b/zerver/lib/pysa.py
@ -0,0 +1,8 @@
+from typing import TypeVar
+
+
+T = TypeVar("T")
+
+
+def mark_sanitized(arg: T) -> T:
+    return arg
--- a/zerver/lib/unminify.py
+++ b/zerver/lib/unminify.py
@ -4,6 +4,8 @@ import sourcemap

 from typing import Dict, List

+from zerver.lib.pysa import mark_sanitized
+

 class SourceMap:
    '''Map (line, column) pairs from generated to source file.'''
@ -15,11 +17,25 @@ class SourceMap:
    def _index_for(self, minified_src: str) -> sourcemap.SourceMapDecoder:
        '''Return the source map index for minified_src, loading it if not
           already loaded.'''
+
+        # Prevent path traversal
+        assert ".." not in minified_src and "/" not in minified_src
+
        if minified_src not in self._indices:
            for source_dir in self._dirs:
                filename = os.path.join(source_dir, minified_src + '.map')
                if os.path.isfile(filename):
-                    with open(filename) as fp:
+                    # Use 'mark_sanitized' to force Pysa to ignore the fact that
+                    # 'filename' is user controlled. While putting user
+                    # controlled data into a filesystem operation is bad, in
+                    # this case it's benign because 'filename' can't traverse
+                    # directories outside of the pre-configured 'sourcemap_dirs'
+                    # (due to the above assertions) and will always end in
+                    # '.map'. Additionally, the result of this function is used
+                    # for error logging and not returned to the user, so
+                    # controlling the loaded file would not be useful to an
+                    # attacker.
+                    with open(mark_sanitized(filename)) as fp:
                        self._indices[minified_src] = sourcemap.load(fp)
                        break

--- a/zerver/lib/upload.py
+++ b/zerver/lib/upload.py
@ -98,7 +98,9 @@ def sanitize_name(value: str) -> str:
    """
    value = unicodedata.normalize('NFKC', value)
    value = re.sub(r'[^\w\s._-]', '', value, flags=re.U).strip()
-    return mark_safe(re.sub(r'[-\s]+', '-', value, flags=re.U))
+    value = re.sub(r'[-\s]+', '-', value, flags=re.U)
+    assert value not in {'', '.', '..'}
+    return mark_safe(value)

 def random_name(bytes: int=60) -> str:
    return base64.urlsafe_b64encode(os.urandom(bytes)).decode('utf-8')
@ -653,6 +655,7 @@ class S3UploadBackend(ZulipUploadBackend):

 def write_local_file(type: str, path: str, file_data: bytes) -> None:
    file_path = os.path.join(settings.LOCAL_UPLOADS_DIR, type, path)
+
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, 'wb') as f:
        f.write(file_data)
--- a/zerver/lib/url_encoding.py
+++ b/zerver/lib/url_encoding.py
@ -1,6 +1,7 @@
 import urllib
 from typing import Any, Dict, List

+from zerver.lib.pysa import mark_sanitized
 from zerver.lib.topic import get_topic_from_message_info
 from zerver.models import Realm, Stream, UserProfile

@ -97,8 +98,12 @@ def near_pm_message_url(realm: Realm,
    return full_url

 def add_query_to_redirect_url(original_url: str, query: str) -> str:
-    return original_url + "?" + query
+    # Using 'mark_sanitized' because user-controlled data after the '?' is
+    # not relevant for open redirects
+    return original_url + "?" + mark_sanitized(query)

 def add_query_arg_to_redirect_url(original_url: str, query_arg: str) -> str:
    assert '?' in original_url
-    return original_url + "&" + query_arg
+    # Using 'mark_sanitized' because user-controlled data after the '?' is
+    # not relevant for open redirects
+    return original_url + "&" + mark_sanitized(query_arg)
--- a/zerver/lib/url_preview/preview.py
+++ b/zerver/lib/url_preview/preview.py
@ -9,6 +9,7 @@ from typing.re import Match

 from version import ZULIP_VERSION
 from zerver.lib.cache import cache_with_key, get_cache_with_key, preview_url_cache_key
+from zerver.lib.pysa import mark_sanitized
 from zerver.lib.url_preview.oembed import get_oembed_data
 from zerver.lib.url_preview.parsers import OpenGraphParser, GenericParser

@ -88,7 +89,7 @@ def get_link_embed_data(url: str,
    if data.get('oembed'):
        return data

-    response = requests.get(url, stream=True, headers=HEADERS, timeout=TIMEOUT)
+    response = requests.get(mark_sanitized(url), stream=True, headers=HEADERS, timeout=TIMEOUT)
    if response.ok:
        og_data = OpenGraphParser(response.text).extract_data()
        for key in ['title', 'description', 'image']:
--- a/zerver/models.py
+++ b/zerver/models.py
@ -28,6 +28,7 @@ from zerver.lib.timestamp import datetime_to_timestamp
 from django.db.models.signals import post_save, post_delete
 from django.utils.translation import ugettext_lazy as _
 from zerver.lib import cache
+from zerver.lib.pysa import mark_sanitized
 from zerver.lib.validator import check_int, \
    check_short_string, check_long_string, validate_choice_field, check_date, \
    check_url, check_list
@ -518,7 +519,9 @@ class Realm(models.Model):

    @property
    def host(self) -> str:
-        return self.host_for_subdomain(self.subdomain)
+        # Use mark sanitized to prevent false positives from Pysa thinking that
+        # the host is user controlled.
+        return mark_sanitized(self.host_for_subdomain(self.subdomain))

    @staticmethod
    def host_for_subdomain(subdomain: str) -> str:
--- a/zerver/views/auth.py
+++ b/zerver/views/auth.py
@ -30,6 +30,7 @@ from zerver.forms import HomepageForm, OurAuthenticationForm, \
    AuthenticationTokenForm
 from zerver.lib.mobile_auth_otp import otp_encrypt_api_key
 from zerver.lib.push_notifications import push_notifications_enabled
+from zerver.lib.pysa import mark_sanitized
 from zerver.lib.realm_icon import realm_icon_url
 from zerver.lib.request import REQ, has_request_variables, JsonableError
 from zerver.lib.response import json_success, json_error
@ -64,7 +65,10 @@ ExtraContext = Optional[Dict[str, Any]]
 def get_safe_redirect_to(url: str, redirect_host: str) -> str:
    is_url_safe = is_safe_url(url=url, allowed_hosts=None)
    if is_url_safe:
-        return urllib.parse.urljoin(redirect_host, url)
+        # Mark as safe to prevent Pysa from surfacing false positives for
+        # open redirects. In this branch, we have already checked that the URL
+        # points to the specified 'redirect_host', or is relative.
+        return urllib.parse.urljoin(redirect_host, mark_sanitized(url))
    else:
        return redirect_host

@ -186,7 +190,10 @@ def maybe_send_to_registration(request: HttpRequest, email: str, full_name: str=
            host = realm.host
        else:
            host = request.get_host()
-        confirmation_link = create_confirmation_link(prereg_user, host,
+        # Mark 'host' as safe for use in a redirect. It's pulled from the
+        # current request or realm, both of which only allow a limited set of
+        # trusted hosts.
+        confirmation_link = create_confirmation_link(prereg_user, mark_sanitized(host),
                                                     Confirmation.USER_REGISTRATION)
        if is_signup:
            return redirect(confirmation_link)
--- a/zerver/views/registration.py
+++ b/zerver/views/registration.py
@ -27,6 +27,7 @@ from zerver.decorator import require_post, \
 from zerver.lib.create_user import get_role_for_new_user
 from zerver.lib.onboarding import send_initial_realm_messages, setup_realm_internal_bots
 from zerver.lib.sessions import get_expirable_session_var
+from zerver.lib.pysa import mark_sanitized
 from zerver.lib.subdomains import get_subdomain, is_root_domain_available
 from zerver.lib.timezone import get_all_timezones
 from zerver.lib.url_encoding import add_query_to_redirect_url
@ -408,7 +409,9 @@ def login_and_go_to_home(request: HttpRequest, user_profile: UserProfile) -> Htt
        return finish_desktop_flow(request, user_profile, desktop_flow_otp)

    do_login(request, user_profile)
-    return HttpResponseRedirect(user_profile.realm.uri + reverse('zerver.views.home.home'))
+    # Using 'mark_sanitized' to work around false positive where Pysa thinks
+    # that 'user_profile' is user-controlled
+    return HttpResponseRedirect(mark_sanitized(user_profile.realm.uri) + reverse('zerver.views.home.home'))

 def prepare_activation_url(email: str, request: HttpRequest,
                           realm_creation: bool=False,
--- a/zerver/views/video_calls.py
+++ b/zerver/views/video_calls.py
@ -19,6 +19,7 @@ from requests_oauthlib import OAuth2Session
 from zerver.decorator import REQ, has_request_variables, zulip_login_required
 from zerver.lib.actions import do_set_zoom_token
 from zerver.lib.exceptions import ErrorCode, JsonableError
+from zerver.lib.pysa import mark_sanitized
 from zerver.lib.response import json_success
 from zerver.lib.subdomains import get_subdomain
 from zerver.lib.validator import check_dict, check_string
@ -57,7 +58,10 @@ def get_zoom_sid(request: HttpRequest) -> str:
    # token directly to the Zoom server.

    csrf.get_token(request)
-    return (
+    # Use 'mark_sanitized' to cause Pysa to ignore the flow of user controlled
+    # data out of this function. 'request.META' is indeed user controlled, but
+    # post-HMAC ouptut is no longer meaningfully controllable.
+    return mark_sanitized(
        ""
        if getattr(request, "_dont_enforce_csrf_checks", False)
        else salted_hmac("Zulip Zoom sid", request.META["CSRF_COOKIE"]).hexdigest()
--- a/zerver/views/zephyr.py
+++ b/zerver/views/zephyr.py
@ -3,6 +3,7 @@ from django.http import HttpResponse, HttpRequest
 from django.utils.translation import ugettext as _
 from zerver.decorator import authenticated_json_view
 from zerver.lib.ccache import make_ccache
+from zerver.lib.pysa import mark_sanitized
 from zerver.lib.request import has_request_variables, REQ
 from zerver.lib.response import json_success, json_error
 from zerver.lib.users import get_api_key
@ -43,6 +44,14 @@ def webathena_kerberos_login(request: HttpRequest, user_profile: UserProfile,
        # This is important for security since DNS is not secure.
        assert(re.match(r'^[a-z0-9_.-]+$', user) is not None)
        ccache = make_ccache(parsed_cred)
+
+        # 'user' has been verified to contain only benign characters that won't
+        # help with shell injection.
+        user = mark_sanitized(user)
+
+        # 'ccache' is only written to disk by the script and used as a kerberos
+        # credential cache file.
+        ccache = mark_sanitized(ccache)
    except Exception:
        return json_error(_("Invalid Kerberos cache"))

--- a/zerver/worker/queue_processors.py
+++ b/zerver/worker/queue_processors.py
@ -45,6 +45,7 @@ from zerver.lib.bot_lib import EmbeddedBotHandler, get_bot_handler, EmbeddedBotQ
 from zerver.lib.exceptions import RateLimited
 from zerver.lib.export import export_realm_wrapper
 from zerver.lib.remote_server import PushNotificationBouncerRetryLaterError
+from zerver.lib.pysa import mark_sanitized

 import os
 import ujson
@ -204,7 +205,9 @@ class QueueProcessingWorker(ABC):
        self._log_problem()
        if not os.path.exists(settings.QUEUE_ERROR_DIR):
            os.mkdir(settings.QUEUE_ERROR_DIR)  # nocoverage
-        fname = f'{self.queue_name}.errors'
+        # Use 'mark_sanitized' to prevent Pysa from detecting this false positive
+        # flow. 'queue_name' is always a constant string.
+        fname = mark_sanitized(f'{self.queue_name}.errors')
        fn = os.path.join(settings.QUEUE_ERROR_DIR, fname)
        line = f'{time.asctime()}\t{ujson.dumps(events)}\n'
        lock_fn = fn + '.lock'