[Python-checkins] cpython (3.5): Issue #17214: Percent-encode non-ASCII bytes in redirect targets

martin.panter python-checkins at python.org
Mon May 16 04:15:00 EDT 2016


https://hg.python.org/cpython/rev/cb09fdef19f5
changeset:   101371:cb09fdef19f5
branch:      3.5
parent:      101368:52a7f580580c
user:        Martin Panter <vadmium+py at gmail.com>
date:        Mon May 16 01:14:20 2016 +0000
summary:
  Issue #17214: Percent-encode non-ASCII bytes in redirect targets

Some servers send Location header fields with non-ASCII bytes, but
"http.client" requires the request target to be ASCII-encodable; otherwise a
UnicodeEncodeError is raised. Based on patch by Christian Heimes.

Python 2 does not suffer any problem because it allows non-ASCII bytes in the
HTTP request target.

files:
  Lib/test/test_urllib2.py |  35 ++++++++++++++++++++++++++++
  Lib/urllib/request.py    |  12 ++++++++-
  Misc/NEWS                |   6 ++++
  3 files changed, 52 insertions(+), 1 deletions(-)


diff --git a/Lib/test/test_urllib2.py b/Lib/test/test_urllib2.py
--- a/Lib/test/test_urllib2.py
+++ b/Lib/test/test_urllib2.py
@@ -1224,6 +1224,41 @@
         fp = urllib.request.urlopen("http://python.org/path")
         self.assertEqual(fp.geturl(), "http://python.org/path?query")
 
+    def test_redirect_encoding(self):
+        # Some characters in the redirect target may need special handling,
+        # but most ASCII characters should be treated as already encoded
+        class Handler(urllib.request.HTTPHandler):
+            def http_open(self, req):
+                result = self.do_open(self.connection, req)
+                self.last_buf = self.connection.buf
+                # Set up a normal response for the next request
+                self.connection = test_urllib.fakehttp(
+                    b'HTTP/1.1 200 OK\r\n'
+                    b'Content-Length: 3\r\n'
+                    b'\r\n'
+                    b'123'
+                )
+                return result
+        handler = Handler()
+        opener = urllib.request.build_opener(handler)
+        tests = (
+            (b'/p\xC3\xA5-dansk/', b'/p%C3%A5-dansk/'),
+            (b'/spaced%20path/', b'/spaced%20path/'),
+            (b'/spaced path/', b'/spaced%20path/'),
+            (b'/?p\xC3\xA5-dansk', b'/?p%C3%A5-dansk'),
+        )
+        for [location, result] in tests:
+            with self.subTest(repr(location)):
+                handler.connection = test_urllib.fakehttp(
+                    b'HTTP/1.1 302 Redirect\r\n'
+                    b'Location: ' + location + b'\r\n'
+                    b'\r\n'
+                )
+                response = opener.open('http://example.com/')
+                expected = b'GET ' + result + b' '
+                request = handler.last_buf
+                self.assertTrue(request.startswith(expected), repr(request))
+
     def test_proxy(self):
         o = OpenerDirector()
         ph = urllib.request.ProxyHandler(dict(http="proxy.example.com:3128"))
diff --git a/Lib/urllib/request.py b/Lib/urllib/request.py
--- a/Lib/urllib/request.py
+++ b/Lib/urllib/request.py
@@ -91,6 +91,7 @@
 import posixpath
 import re
 import socket
+import string
 import sys
 import time
 import collections
@@ -616,8 +617,12 @@
         # from the user (of urllib.request, in this case).  In practice,
         # essentially all clients do redirect in this case, so we do
         # the same.
-        # be conciliant with URIs containing a space
+
+        # Be conciliant with URIs containing a space.  This is mainly
+        # redundant with the more complete encoding done in http_error_302(),
+        # but it is kept for compatibility with other callers.
         newurl = newurl.replace(' ', '%20')
+
         CONTENT_HEADERS = ("content-length", "content-type")
         newheaders = dict((k, v) for k, v in req.headers.items()
                           if k.lower() not in CONTENT_HEADERS)
@@ -657,6 +662,11 @@
             urlparts[2] = "/"
         newurl = urlunparse(urlparts)
 
+        # http.client.parse_headers() decodes as ISO-8859-1.  Recover the
+        # original bytes and percent-encode non-ASCII bytes, and any special
+        # characters such as the space.
+        newurl = quote(
+            newurl, encoding="iso-8859-1", safe=string.punctuation)
         newurl = urljoin(req.full_url, newurl)
 
         # XXX Probably want to forget about the state of the current
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -121,6 +121,12 @@
 - Issue #14132: Fix urllib.request redirect handling when the target only has
   a query string.  Original fix by Ján Janech.
 
+- Issue #17214: The "urllib.request" module now percent-encodes non-ASCII
+  bytes found in redirect target URLs.  Some servers send Location header
+  fields with non-ASCII bytes, but "http.client" requires the request target
+  to be ASCII-encodable, otherwise a UnicodeEncodeError is raised.  Based on
+  patch by Christian Heimes.
+
 - Issue #26892: Honor debuglevel flag in urllib.request.HTTPHandler. Patch
   contributed by Chi Hsuan Yen.
 

-- 
Repository URL: https://hg.python.org/cpython


More information about the Python-checkins mailing list