Rewrite sanitize_name to better preserve filenames.

The previous version of sanitize_name dropped all unicode characters
and mangled filenames with multiple `.`s in the extension, leading to
confusing URLs for files uploaded to Zulip.

Fixes #321.

[tweaked significantly by tabbott]
This commit is contained in:
Varshit 2016-03-13 14:59:33 +05:30 committed by Tim Abbott
parent 4a50336476
commit e0ef1a991e
2 changed files with 36 additions and 8 deletions

View File

@ -2,6 +2,9 @@ from __future__ import absolute_import
from django.conf import settings
from django.template.defaultfilters import slugify
from django.utils.encoding import force_text
from django.utils.safestring import mark_safe
import unicodedata
from zerver.lib.avatar import user_avatar_hash
@ -13,6 +16,7 @@ from zerver.models import get_user_profile_by_id
import base64
import os
import re
from PIL import Image, ImageOps
from six.moves import cStringIO as StringIO
import random
@ -28,15 +32,26 @@ import random
# This is great, because passing the pseudofile object that Django gives
# you to boto would be a pain.
# To come up with a s3 key we randomly generate a "directory". The "file
# name" is the original filename provided by the user run through Django's
# slugify.
# To come up with an S3 key we randomly generate a "directory". The
# "file name" is the original filename provided by the user, run
# through a sanitization function.
def sanitize_name(name):
    """Slugify the base and the extension of a filename separately.

    The text after the last '.' is treated as the extension and
    everything before it as the base; each part is passed through
    slugify and the two results are joined with a single '.'.
    """
    # rpartition splits on the last '.'; when there is no '.', the base
    # is empty and the whole name is treated as the extension.
    stem, _dot, suffix = name.rpartition('.')
    return ".".join((slugify(stem), slugify(suffix)))
def sanitize_name(value):
    """
    Sanitize a value to be safe to store in a Linux filesystem, in
    S3, and in a URL.

    Unicode word characters are allowed, but special characters other
    than ".", "-", and "_" are removed. Runs of whitespace and/or "-"
    are collapsed into a single "-".

    This implementation is based on django.utils.text.slugify; it is
    modified by:
    * hardcoding allow_unicode=True.
    * adding '.' and '_' to the list of allowed characters.
    * preserving the case of the value.

    Note that the result can be empty, or consist only of dots (for
    example ".."), when the input contains no other allowed characters.

    Returns the sanitized text wrapped with mark_safe.
    """
    value = force_text(value)
    # Normalize to NFKC before filtering so the character classes below
    # see the composed/compatibility-folded form of each character.
    value = unicodedata.normalize('NFKC', value)
    # Raw strings: '\w' and '\s' are regex escapes, not string escapes.
    # Drop everything that is not a word character, whitespace, '.',
    # '_' or '-', then trim surrounding whitespace.
    value = re.sub(r'[^\w\s._-]', '', value, flags=re.U).strip()
    # Collapse each run of hyphens/whitespace into one hyphen.
    return mark_safe(re.sub(r'[-\s]+', '-', value, flags=re.U))
def random_name(bytes=60):
    """Return the URL-safe base64 encoding of `bytes` bytes read from os.urandom."""
    entropy = os.urandom(bytes)
    return base64.urlsafe_b64encode(entropy)

View File

@ -17,6 +17,7 @@ from zerver.lib.actions import compute_mit_user_fullname
from zerver.lib.test_helpers import AuthedTestCase
from zerver.models import get_user_profile_by_email
from zerver.lib.test_runner import slow
from zerver.lib.upload import sanitize_name
import time
import ujson
@ -227,3 +228,15 @@ class GCMTokenTests(AuthedTestCase):
result = self.client.post('/json/users/me/android_gcm_reg_id', {'token':token})
self.assert_json_success(result)
class SanitizeNameTests(TestCase):
    """Checks that sanitize_name keeps filenames recognizable."""

    def test_file_name(self):
        """Exercise sanitize_name on plain, dotted, unicode and hostile names."""
        # Ordinary names, leading dots and multi-part extensions are
        # expected to pass through unchanged.
        self.assertEqual(sanitize_name(u'test.txt'), u'test.txt')
        self.assertEqual(sanitize_name(u'.hidden'), u'.hidden')
        self.assertEqual(sanitize_name(u'.hidden.txt'), u'.hidden.txt')
        self.assertEqual(sanitize_name(u'tarball.tar.gz'), u'tarball.tar.gz')
        self.assertEqual(sanitize_name(u'.hidden_tarball.tar.gz'), u'.hidden_tarball.tar.gz')
        # Punctuation other than '.', '-' and '_' is expected to be removed.
        self.assertEqual(sanitize_name(u'Testing{}*&*#().ta&&%$##&&r.gz'), u'Testing.tar.gz')
        self.assertEqual(sanitize_name(u'*testingfile?*.txt'), u'testingfile.txt')
        # A symbol such as the snowman is dropped, while letters from
        # other scripts are kept.
        self.assertEqual(sanitize_name(u'snowman☃.txt'), u'snowman.txt')
        self.assertEqual(sanitize_name(u'테스트.txt'), u'테스트.txt')
        # Backslashes are written doubled so the literal contains the
        # same characters without relying on invalid escape sequences.
        self.assertEqual(sanitize_name(u'~/."\\`\\?*"u0`000ssh/test.t**{}ar.gz'), u'.u0000sshtest.tar.gz')