Rewrite sanitize_name to better preserve filenames.

The previous version of sanitize_name dropped all unicode characters and mangled filenames with multiple `.`s in the extension, leading to confusing URLs for files uploaded to Zulip. Fixes #321. [tweaked significantly by tabbott]
2016-03-13 14:59:33 +05:30 · 2016-03-13 14:59:33 +05:30 · e0ef1a991e
parent 4a50336476
commit e0ef1a991e
2 changed files with 36 additions and 8 deletions
--- a/zerver/lib/upload.py
+++ b/zerver/lib/upload.py
@ -2,6 +2,9 @@ from __future__ import absolute_import

 from django.conf import settings
 from django.template.defaultfilters import slugify
+from django.utils.encoding import force_text
+from django.utils.safestring import mark_safe
+import unicodedata

 from zerver.lib.avatar import user_avatar_hash

@ -13,6 +16,7 @@ from zerver.models import get_user_profile_by_id

 import base64
 import os
+import re
 from PIL import Image, ImageOps
 from six.moves import cStringIO as StringIO
 import random
@ -28,15 +32,26 @@ import random
 # This is great, because passing the pseudofile object that Django gives
 # you to boto would be a pain.

-# To come up with a s3 key we randomly generate a "directory". The "file
-# name" is the original filename provided by the user run through Django's
-# slugify.
+# To come up with a s3 key we randomly generate a "directory". The
+# "file name" is the original filename provided by the user run
+# through a sanitization function.

-def sanitize_name(name):
-    split_name = name.split('.')
-    base = ".".join(split_name[:-1])
-    extension = split_name[-1]
-    return slugify(base) + "." + slugify(extension)
+def sanitize_name(value):
+    """
+    Sanitizes a value to be safe to store in a Linux filesystem, in
+    S3, and in a URL.  So unicode is allowed, but not special
+    characters other than ".", "-", and "_".
+
+    This implementation is based on django.utils.text.slugify; it is
+    modified by:
+    * hardcoding allow_unicode=True.
+    * adding '.' and '_' to the list of allowed characters.
+    * preserving the case of the value.
+    """
+    value = force_text(value)
+    value = unicodedata.normalize('NFKC', value)
+    value = re.sub('[^\w\s._-]', '', value, flags=re.U).strip()
+    return mark_safe(re.sub('[-\s]+', '-', value, flags=re.U))

 def random_name(bytes=60):
    return base64.urlsafe_b64encode(os.urandom(bytes))
--- a/zerver/test_external.py
+++ b/zerver/test_external.py
@ -17,6 +17,7 @@ from zerver.lib.actions import compute_mit_user_fullname
 from zerver.lib.test_helpers import AuthedTestCase
 from zerver.models import get_user_profile_by_email
 from zerver.lib.test_runner import slow
+from zerver.lib.upload import sanitize_name

 import time
 import ujson
@ -227,3 +228,15 @@ class GCMTokenTests(AuthedTestCase):
        result = self.client.post('/json/users/me/android_gcm_reg_id', {'token':token})
        self.assert_json_success(result)

+class SanitizeNameTests(TestCase):
+    def test_file_name(self):
+        self.assertEquals(sanitize_name(u'test.txt'), u'test.txt')
+        self.assertEquals(sanitize_name(u'.hidden'), u'.hidden')
+        self.assertEquals(sanitize_name(u'.hidden.txt'), u'.hidden.txt')
+        self.assertEquals(sanitize_name(u'tarball.tar.gz'), u'tarball.tar.gz')
+        self.assertEquals(sanitize_name(u'.hidden_tarball.tar.gz'), u'.hidden_tarball.tar.gz')
+        self.assertEquals(sanitize_name(u'Testing{}*&*#().ta&&%$##&&r.gz'), u'Testing.tar.gz')
+        self.assertEquals(sanitize_name(u'*testingfile?*.txt'), u'testingfile.txt')
+        self.assertEquals(sanitize_name(u'snowman☃.txt'), u'snowman.txt')
+        self.assertEquals(sanitize_name(u'테스트.txt'), u'테스트.txt')
+        self.assertEquals(sanitize_name(u'~/."\`\?*"u0`000ssh/test.t**{}ar.gz'), u'.u0000sshtest.tar.gz')