Rewrite sanitize_name to better preserve filenames.

The previous version of sanitize_name dropped all unicode characters
and mangled filenames with multiple `.`s in the extension, leading to
confusing URLs for files uploaded to Zulip.

Fixes #321.

[tweaked significantly by tabbott]
This commit is contained in:
Varshit 2016-03-13 14:59:33 +05:30 committed by Tim Abbott
parent 4a50336476
commit e0ef1a991e
2 changed files with 36 additions and 8 deletions

View File

@ -2,6 +2,9 @@ from __future__ import absolute_import
from django.conf import settings from django.conf import settings
from django.template.defaultfilters import slugify from django.template.defaultfilters import slugify
from django.utils.encoding import force_text
from django.utils.safestring import mark_safe
import unicodedata
from zerver.lib.avatar import user_avatar_hash from zerver.lib.avatar import user_avatar_hash
@ -13,6 +16,7 @@ from zerver.models import get_user_profile_by_id
import base64 import base64
import os import os
import re
from PIL import Image, ImageOps from PIL import Image, ImageOps
from six.moves import cStringIO as StringIO from six.moves import cStringIO as StringIO
import random import random
@ -28,15 +32,26 @@ import random
# This is great, because passing the pseudofile object that Django gives # This is great, because passing the pseudofile object that Django gives
# you to boto would be a pain. # you to boto would be a pain.
# To come up with a s3 key we randomly generate a "directory". The "file # To come up with a s3 key we randomly generate a "directory". The
# name" is the original filename provided by the user run through Django's # "file name" is the original filename provided by the user run
# slugify. # through a sanitization function.
def sanitize_name(name): def sanitize_name(value):
split_name = name.split('.') """
base = ".".join(split_name[:-1]) Sanitizes a value to be safe to store in a Linux filesystem, in
extension = split_name[-1] S3, and in a URL. So unicode is allowed, but not special
return slugify(base) + "." + slugify(extension) characters other than ".", "-", and "_".
This implementation is based on django.utils.text.slugify; it is
modified by:
* hardcoding allow_unicode=True.
* adding '.' and '_' to the list of allowed characters.
* preserving the case of the value.
"""
value = force_text(value)
value = unicodedata.normalize('NFKC', value)
value = re.sub('[^\w\s._-]', '', value, flags=re.U).strip()
return mark_safe(re.sub('[-\s]+', '-', value, flags=re.U))
def random_name(bytes=60): def random_name(bytes=60):
return base64.urlsafe_b64encode(os.urandom(bytes)) return base64.urlsafe_b64encode(os.urandom(bytes))

View File

@ -17,6 +17,7 @@ from zerver.lib.actions import compute_mit_user_fullname
from zerver.lib.test_helpers import AuthedTestCase from zerver.lib.test_helpers import AuthedTestCase
from zerver.models import get_user_profile_by_email from zerver.models import get_user_profile_by_email
from zerver.lib.test_runner import slow from zerver.lib.test_runner import slow
from zerver.lib.upload import sanitize_name
import time import time
import ujson import ujson
@ -227,3 +228,15 @@ class GCMTokenTests(AuthedTestCase):
result = self.client.post('/json/users/me/android_gcm_reg_id', {'token':token}) result = self.client.post('/json/users/me/android_gcm_reg_id', {'token':token})
self.assert_json_success(result) self.assert_json_success(result)
class SanitizeNameTests(TestCase):
def test_file_name(self):
self.assertEquals(sanitize_name(u'test.txt'), u'test.txt')
self.assertEquals(sanitize_name(u'.hidden'), u'.hidden')
self.assertEquals(sanitize_name(u'.hidden.txt'), u'.hidden.txt')
self.assertEquals(sanitize_name(u'tarball.tar.gz'), u'tarball.tar.gz')
self.assertEquals(sanitize_name(u'.hidden_tarball.tar.gz'), u'.hidden_tarball.tar.gz')
self.assertEquals(sanitize_name(u'Testing{}*&*#().ta&&%$##&&r.gz'), u'Testing.tar.gz')
self.assertEquals(sanitize_name(u'*testingfile?*.txt'), u'testingfile.txt')
self.assertEquals(sanitize_name(u'snowman☃.txt'), u'snowman.txt')
self.assertEquals(sanitize_name(u'테스트.txt'), u'테스트.txt')
self.assertEquals(sanitize_name(u'~/."\`\?*"u0`000ssh/test.t**{}ar.gz'), u'.u0000sshtest.tar.gz')