pysa: Introduce sanitizers, models, and inline marking safe.

This commit adds three `.pysa` model files: `false_positives.pysa`
for ruling out false positive flows with `Sanitize` annotations,
`req_lib.pysa` for educating pysa about Zulip's `REQ()` pattern for
extracting user input, and `redirects.pysa` for capturing the risk
of open redirects within Zulip code. Additionally, this commit
introduces `mark_sanitized`, an identity function which can be used
to selectively clear taint in cases where `Sanitize` models will not
work. This commit also puts `mark_sanitized` to work removing known
false postive flows.
This commit is contained in:
Graham Bleaney 2019-12-19 18:00:45 -05:00 committed by Tim Abbott
parent 89131bfcbb
commit 461d5b1a3e
15 changed files with 166 additions and 16 deletions

View File

@ -0,0 +1,59 @@
# This function ensures that a redirect is only within the specified domain.
# Assuming that the domain isn't attacker controllable, the result is safe to
# redirect to
def zerver.views.auth.get_safe_redirect_to(url, redirect_host) -> Sanitize: ...
# This function was previously the source of an open redirect, but has now been
# reviewed and patched, so the output should now be safe to redirect to,
# regardless of the value of the specified 'path'.
def zerver.lib.thumbnail.generate_thumbnail_url(
path,
size=...,
is_camo_url=...
) -> Sanitize: ...
# This function returns a version of name that only contains word and space
# characters, or ., -, _ characters. This should be safe to put into URLs and
# filesystem operations.
def zerver.lib.upload.sanitize_name(value) -> Sanitize: ...
# This function accepts two integers and then concatenates them into a path
# segment. The result should be safe for use in filesystem and other operations.
def zerver.lib.avatar_hash.user_avatar_path_from_ids(user_profile_id, realm_id) -> Sanitize: ...
# This function creates a list of 'UserMessageLite' objects, which contain only
# integral IDs and flags. These should safe for use with SQL and other
# operations.
def zerver.lib.actions.create_user_messages(
message,
um_eligible_user_ids,
long_term_idle_user_ids,
stream_push_user_ids,
stream_email_user_ids,
mentioned_user_ids,
mark_as_read
) -> Sanitize: ...
# This function is an identity function used for removing taint from variables
# when there is no convenient way to do it by annotating existing functions.
def zerver.lib.pysa.mark_sanitized(arg) -> Sanitize: ...
############################
# Overbroad Approximations #
############################
# Note that the below functions are overbroad approximations of Sanitizers and
# could lead to false negatives. They should be replaced with more specific
# feature-based filtering when that is available through SAPP.
# This function generates a URL pointing to a valid Django endpoint, with
# arguments properly URL encoded. The resulting URL can usually be used as a
# part of a redirect or HTTP request without fear of open redirect or SSRF
# vulnerabilities respectively.
def django.urls.base.reverse(
viewname,
urlconf=...,
args=...,
kwargs=...,
current_app=...
) -> Sanitize: ...

4
stubs/taint/req_lib.pysa Normal file
View File

@ -0,0 +1,4 @@
# One of the ways user-controlled data enters the application is through the
# request variables framework. This model teaches Pysa that every instance of
# 'REQ()' in a view function is a source of UserControlled taint.
class zerver.lib.request._REQ(TaintSource[UserControlled]): ...

View File

@ -50,6 +50,7 @@ from zerver.lib.message import (
truncate_body,
truncate_topic,
)
from zerver.lib.pysa import mark_sanitized
from zerver.lib.realm_icon import realm_icon_url
from zerver.lib.realm_logo import get_realm_logo_data
from zerver.lib.retention import move_messages_to_archive
@ -5146,6 +5147,11 @@ def check_add_realm_emoji(realm: Realm,
realm_emoji.save()
emoji_file_name = get_emoji_file_name(image_file.name, realm_emoji.id)
# The only user-controlled portion of 'emoji_file_name' is an extension,
# which can not contain '..' or '/' or '\', making it difficult to exploit
emoji_file_name = mark_sanitized(emoji_file_name)
emoji_uploaded_successfully = False
try:
upload_emoji_image(image_file, emoji_file_name, author)

View File

@ -31,6 +31,8 @@ from zerver.models import UserProfile, Realm, Client, Huddle, Stream, \
CustomProfileFieldValue, get_display_recipient, Attachment, get_system_bot, \
RealmAuditLog, UserHotspot, MutedTopic, Service, UserGroup, \
UserGroupMembership, BotStorageData, BotConfigData
from zerver.lib.pysa import mark_sanitized
import zerver.lib.upload
from typing import Any, Callable, Dict, List, Optional, Set, Tuple, \
Union
@ -1239,7 +1241,11 @@ def _save_s3_object_to_file(key: ServiceResource, output_dir: str, processing_av
if "../" in filename:
raise AssertionError(f"Suspicious file with invalid format {filename}")
dirname = os.path.dirname(filename)
# Use 'mark_sanitized' to cause Pysa to ignore the flow of user controlled
# data into the filesystem sink, because we've already prevented directory
# traversal with our assertion above.
dirname = mark_sanitized(os.path.dirname(filename))
if not os.path.exists(dirname):
os.makedirs(dirname)
key.download_file(filename)
@ -1303,16 +1309,22 @@ def export_uploads_from_local(realm: Realm, local_dir: Path, output_dir: Path) -
count = 0
records = []
for attachment in Attachment.objects.filter(realm_id=realm.id):
local_path = os.path.join(local_dir, attachment.path_id)
output_path = os.path.join(output_dir, attachment.path_id)
# Use 'mark_sanitized' to work around false positive caused by Pysa
# thinking that 'realm' (and thus 'attachment' and 'attachment.path_id')
# are user controlled
path_id = mark_sanitized(attachment.path_id)
local_path = os.path.join(local_dir, path_id)
output_path = os.path.join(output_dir, path_id)
os.makedirs(os.path.dirname(output_path), exist_ok=True)
shutil.copy2(local_path, output_path)
stat = os.stat(local_path)
record = dict(realm_id=attachment.realm_id,
user_profile_id=attachment.owner.id,
user_profile_email=attachment.owner.email,
s3_path=attachment.path_id,
path=attachment.path_id,
s3_path=path_id,
path=path_id,
size=stat.st_size,
last_modified=stat.st_mtime,
content_type=None)
@ -1398,8 +1410,15 @@ def export_emoji_from_local(realm: Realm, local_dir: Path, output_dir: Path) ->
realm_id=realm.id,
emoji_file_name=realm_emoji.file_name
)
# Use 'mark_sanitized' to work around false positive caused by Pysa
# thinking that 'realm' (and thus 'attachment' and 'attachment.path_id')
# are user controlled
emoji_path = mark_sanitized(emoji_path)
local_path = os.path.join(local_dir, emoji_path)
output_path = os.path.join(output_dir, emoji_path)
os.makedirs(os.path.dirname(output_path), exist_ok=True)
shutil.copy2(local_path, output_path)
# Realm Emoji author is optional.

8
zerver/lib/pysa.py Normal file
View File

@ -0,0 +1,8 @@
from typing import TypeVar
T = TypeVar("T")
def mark_sanitized(arg: T) -> T:
return arg

View File

@ -4,6 +4,8 @@ import sourcemap
from typing import Dict, List
from zerver.lib.pysa import mark_sanitized
class SourceMap:
'''Map (line, column) pairs from generated to source file.'''
@ -15,11 +17,25 @@ class SourceMap:
def _index_for(self, minified_src: str) -> sourcemap.SourceMapDecoder:
'''Return the source map index for minified_src, loading it if not
already loaded.'''
# Prevent path traversal
assert ".." not in minified_src and "/" not in minified_src
if minified_src not in self._indices:
for source_dir in self._dirs:
filename = os.path.join(source_dir, minified_src + '.map')
if os.path.isfile(filename):
with open(filename) as fp:
# Use 'mark_sanitized' to force Pysa to ignore the fact that
# 'filename' is user controlled. While putting user
# controlled data into a filesystem operation is bad, in
# this case it's benign because 'filename' can't traverse
# directories outside of the pre-configured 'sourcemap_dirs'
# (due to the above assertions) and will always end in
# '.map'. Additionally, the result of this function is used
# for error logging and not returned to the user, so
# controlling the loaded file would not be useful to an
# attacker.
with open(mark_sanitized(filename)) as fp:
self._indices[minified_src] = sourcemap.load(fp)
break

View File

@ -98,7 +98,9 @@ def sanitize_name(value: str) -> str:
"""
value = unicodedata.normalize('NFKC', value)
value = re.sub(r'[^\w\s._-]', '', value, flags=re.U).strip()
return mark_safe(re.sub(r'[-\s]+', '-', value, flags=re.U))
value = re.sub(r'[-\s]+', '-', value, flags=re.U)
assert value not in {'', '.', '..'}
return mark_safe(value)
def random_name(bytes: int=60) -> str:
return base64.urlsafe_b64encode(os.urandom(bytes)).decode('utf-8')
@ -653,6 +655,7 @@ class S3UploadBackend(ZulipUploadBackend):
def write_local_file(type: str, path: str, file_data: bytes) -> None:
file_path = os.path.join(settings.LOCAL_UPLOADS_DIR, type, path)
os.makedirs(os.path.dirname(file_path), exist_ok=True)
with open(file_path, 'wb') as f:
f.write(file_data)

View File

@ -1,6 +1,7 @@
import urllib
from typing import Any, Dict, List
from zerver.lib.pysa import mark_sanitized
from zerver.lib.topic import get_topic_from_message_info
from zerver.models import Realm, Stream, UserProfile
@ -97,8 +98,12 @@ def near_pm_message_url(realm: Realm,
return full_url
def add_query_to_redirect_url(original_url: str, query: str) -> str:
return original_url + "?" + query
# Using 'mark_sanitized' because user-controlled data after the '?' is
# not relevant for open redirects
return original_url + "?" + mark_sanitized(query)
def add_query_arg_to_redirect_url(original_url: str, query_arg: str) -> str:
assert '?' in original_url
return original_url + "&" + query_arg
# Using 'mark_sanitized' because user-controlled data after the '?' is
# not relevant for open redirects
return original_url + "&" + mark_sanitized(query_arg)

View File

@ -9,6 +9,7 @@ from typing.re import Match
from version import ZULIP_VERSION
from zerver.lib.cache import cache_with_key, get_cache_with_key, preview_url_cache_key
from zerver.lib.pysa import mark_sanitized
from zerver.lib.url_preview.oembed import get_oembed_data
from zerver.lib.url_preview.parsers import OpenGraphParser, GenericParser
@ -88,7 +89,7 @@ def get_link_embed_data(url: str,
if data.get('oembed'):
return data
response = requests.get(url, stream=True, headers=HEADERS, timeout=TIMEOUT)
response = requests.get(mark_sanitized(url), stream=True, headers=HEADERS, timeout=TIMEOUT)
if response.ok:
og_data = OpenGraphParser(response.text).extract_data()
for key in ['title', 'description', 'image']:

View File

@ -28,6 +28,7 @@ from zerver.lib.timestamp import datetime_to_timestamp
from django.db.models.signals import post_save, post_delete
from django.utils.translation import ugettext_lazy as _
from zerver.lib import cache
from zerver.lib.pysa import mark_sanitized
from zerver.lib.validator import check_int, \
check_short_string, check_long_string, validate_choice_field, check_date, \
check_url, check_list
@ -518,7 +519,9 @@ class Realm(models.Model):
@property
def host(self) -> str:
return self.host_for_subdomain(self.subdomain)
# Use mark sanitized to prevent false positives from Pysa thinking that
# the host is user controlled.
return mark_sanitized(self.host_for_subdomain(self.subdomain))
@staticmethod
def host_for_subdomain(subdomain: str) -> str:

View File

@ -30,6 +30,7 @@ from zerver.forms import HomepageForm, OurAuthenticationForm, \
AuthenticationTokenForm
from zerver.lib.mobile_auth_otp import otp_encrypt_api_key
from zerver.lib.push_notifications import push_notifications_enabled
from zerver.lib.pysa import mark_sanitized
from zerver.lib.realm_icon import realm_icon_url
from zerver.lib.request import REQ, has_request_variables, JsonableError
from zerver.lib.response import json_success, json_error
@ -64,7 +65,10 @@ ExtraContext = Optional[Dict[str, Any]]
def get_safe_redirect_to(url: str, redirect_host: str) -> str:
is_url_safe = is_safe_url(url=url, allowed_hosts=None)
if is_url_safe:
return urllib.parse.urljoin(redirect_host, url)
# Mark as safe to prevent Pysa from surfacing false positives for
# open redirects. In this branch, we have already checked that the URL
# points to the specified 'redirect_host', or is relative.
return urllib.parse.urljoin(redirect_host, mark_sanitized(url))
else:
return redirect_host
@ -186,7 +190,10 @@ def maybe_send_to_registration(request: HttpRequest, email: str, full_name: str=
host = realm.host
else:
host = request.get_host()
confirmation_link = create_confirmation_link(prereg_user, host,
# Mark 'host' as safe for use in a redirect. It's pulled from the
# current request or realm, both of which only allow a limited set of
# trusted hosts.
confirmation_link = create_confirmation_link(prereg_user, mark_sanitized(host),
Confirmation.USER_REGISTRATION)
if is_signup:
return redirect(confirmation_link)

View File

@ -27,6 +27,7 @@ from zerver.decorator import require_post, \
from zerver.lib.create_user import get_role_for_new_user
from zerver.lib.onboarding import send_initial_realm_messages, setup_realm_internal_bots
from zerver.lib.sessions import get_expirable_session_var
from zerver.lib.pysa import mark_sanitized
from zerver.lib.subdomains import get_subdomain, is_root_domain_available
from zerver.lib.timezone import get_all_timezones
from zerver.lib.url_encoding import add_query_to_redirect_url
@ -408,7 +409,9 @@ def login_and_go_to_home(request: HttpRequest, user_profile: UserProfile) -> Htt
return finish_desktop_flow(request, user_profile, desktop_flow_otp)
do_login(request, user_profile)
return HttpResponseRedirect(user_profile.realm.uri + reverse('zerver.views.home.home'))
# Using 'mark_sanitized' to work around false positive where Pysa thinks
# that 'user_profile' is user-controlled
return HttpResponseRedirect(mark_sanitized(user_profile.realm.uri) + reverse('zerver.views.home.home'))
def prepare_activation_url(email: str, request: HttpRequest,
realm_creation: bool=False,

View File

@ -19,6 +19,7 @@ from requests_oauthlib import OAuth2Session
from zerver.decorator import REQ, has_request_variables, zulip_login_required
from zerver.lib.actions import do_set_zoom_token
from zerver.lib.exceptions import ErrorCode, JsonableError
from zerver.lib.pysa import mark_sanitized
from zerver.lib.response import json_success
from zerver.lib.subdomains import get_subdomain
from zerver.lib.validator import check_dict, check_string
@ -57,7 +58,10 @@ def get_zoom_sid(request: HttpRequest) -> str:
# token directly to the Zoom server.
csrf.get_token(request)
return (
# Use 'mark_sanitized' to cause Pysa to ignore the flow of user controlled
# data out of this function. 'request.META' is indeed user controlled, but
# post-HMAC ouptut is no longer meaningfully controllable.
return mark_sanitized(
""
if getattr(request, "_dont_enforce_csrf_checks", False)
else salted_hmac("Zulip Zoom sid", request.META["CSRF_COOKIE"]).hexdigest()

View File

@ -3,6 +3,7 @@ from django.http import HttpResponse, HttpRequest
from django.utils.translation import ugettext as _
from zerver.decorator import authenticated_json_view
from zerver.lib.ccache import make_ccache
from zerver.lib.pysa import mark_sanitized
from zerver.lib.request import has_request_variables, REQ
from zerver.lib.response import json_success, json_error
from zerver.lib.users import get_api_key
@ -43,6 +44,14 @@ def webathena_kerberos_login(request: HttpRequest, user_profile: UserProfile,
# This is important for security since DNS is not secure.
assert(re.match(r'^[a-z0-9_.-]+$', user) is not None)
ccache = make_ccache(parsed_cred)
# 'user' has been verified to contain only benign characters that won't
# help with shell injection.
user = mark_sanitized(user)
# 'ccache' is only written to disk by the script and used as a kerberos
# credential cache file.
ccache = mark_sanitized(ccache)
except Exception:
return json_error(_("Invalid Kerberos cache"))

View File

@ -45,6 +45,7 @@ from zerver.lib.bot_lib import EmbeddedBotHandler, get_bot_handler, EmbeddedBotQ
from zerver.lib.exceptions import RateLimited
from zerver.lib.export import export_realm_wrapper
from zerver.lib.remote_server import PushNotificationBouncerRetryLaterError
from zerver.lib.pysa import mark_sanitized
import os
import ujson
@ -204,7 +205,9 @@ class QueueProcessingWorker(ABC):
self._log_problem()
if not os.path.exists(settings.QUEUE_ERROR_DIR):
os.mkdir(settings.QUEUE_ERROR_DIR) # nocoverage
fname = f'{self.queue_name}.errors'
# Use 'mark_sanitized' to prevent Pysa from detecting this false positive
# flow. 'queue_name' is always a constant string.
fname = mark_sanitized(f'{self.queue_name}.errors')
fn = os.path.join(settings.QUEUE_ERROR_DIR, fname)
line = f'{time.asctime()}\t{ujson.dumps(events)}\n'
lock_fn = fn + '.lock'