Viewing file: _iri.py (8.43 KB) -rw-r--r-- Select action/file-type: (+) | (+) | (+) | Code (+) | Session (+) | (+) | SDB (+) | (+) | (+) | (+) | (+) | (+) |
# coding: utf-8
""" Functions to convert unicode IRIs into ASCII byte string URIs and back. Exports the following items:
- iri_to_uri() - uri_to_iri() """
from __future__ import unicode_literals, division, absolute_import, print_function
from encodings import idna # noqa import codecs import re import sys
from ._errors import unwrap from ._types import byte_cls, str_cls, type_name, bytes_to_list, int_types
if sys.version_info < (3,): from urlparse import urlsplit, urlunsplit from urllib import ( quote as urlquote, unquote as unquote_to_bytes, )
else: from urllib.parse import ( quote as urlquote, unquote_to_bytes, urlsplit, urlunsplit, )
def iri_to_uri(value):
    """
    Normalizes and encodes a unicode IRI into an ASCII byte string URI

    The hostname is IDNA-encoded, the other components are percent-encoded
    via _urlquote(), the default ports (80 for http, 443 for https) are
    dropped, and a path of just "/" is dropped when there is neither a query
    nor a fragment. IPv6 literal hosts are re-wrapped in square brackets.

    :param value:
        A unicode string of an IRI

    :raises:
        TypeError - when value is not a unicode string

    :return:
        A byte string of the ASCII-encoded URI
    """

    if not isinstance(value, str_cls):
        raise TypeError(unwrap(
            '''
            value must be a unicode string, not %s
            ''',
            type_name(value)
        ))

    scheme = None
    # Python 2.6 doesn't split properly if the URL doesn't start with
    # http:// or https://, so the real scheme is temporarily swapped out
    if sys.version_info < (2, 7) and not value.startswith(('http://', 'https://')):
        real_prefix = None
        prefix_match = re.match('^[^:]*://', value)
        if prefix_match:
            real_prefix = prefix_match.group(0)
            value = 'http://' + value[len(real_prefix):]
        parsed = urlsplit(value)
        if real_prefix:
            value = real_prefix + value[7:]
            scheme = _urlquote(real_prefix[:-3])
    else:
        parsed = urlsplit(value)

    if scheme is None:
        scheme = _urlquote(parsed.scheme)

    hostname = parsed.hostname
    if hostname is not None:
        hostname = hostname.encode('idna')
        # The hostname attribute strips the square brackets from an IP
        # literal. A registered name can never contain ":" (it would have
        # been split off as the port), so a colon means the brackets must be
        # restored for the netloc to remain parseable.
        if b':' in hostname:
            hostname = b'[' + hostname + b']'

    # RFC 3986 allows userinfo to contain sub-delims
    username = _urlquote(parsed.username, safe='!$&\'()*+,;=')
    password = _urlquote(parsed.password, safe='!$&\'()*+,;=')

    port = parsed.port
    if port is not None:
        port = str_cls(port).encode('ascii')

    netloc = b''
    if username is not None:
        netloc += username
        if password:
            netloc += b':' + password
        netloc += b'@'
    if hostname is not None:
        netloc += hostname
    if port is not None:
        # The default port of the scheme is omitted as part of normalization
        default_http = scheme == b'http' and port == b'80'
        default_https = scheme == b'https' and port == b'443'
        if not default_http and not default_https:
            netloc += b':' + port

    # RFC 3986 allows a path to contain sub-delims, plus "@" and ":"
    path = _urlquote(parsed.path, safe='/!$&\'()*+,;=@:')
    # RFC 3986 allows the query to contain sub-delims, plus "@", ":" , "/" and "?"
    query = _urlquote(parsed.query, safe='/?!$&\'()*+,;=@:')
    # RFC 3986 allows the fragment to contain sub-delims, plus "@", ":" , "/" and "?"
    fragment = _urlquote(parsed.fragment, safe='/?!$&\'()*+,;=@:')

    if query is None and fragment is None and path == b'/':
        path = None

    # Python 2.7 compat
    if path is None:
        path = ''

    output = urlunsplit((scheme, netloc, path, query, fragment))
    if isinstance(output, str_cls):
        output = output.encode('latin1')
    return output
def uri_to_iri(value):
    """
    Converts an ASCII URI byte string into a unicode IRI

    The hostname is IDNA-decoded and the other components are unquoted via
    _urlunquote(). IPv6 literal hosts are re-wrapped in square brackets.

    :param value:
        An ASCII-encoded byte string of the URI

    :raises:
        TypeError - when value is not a byte string

    :return:
        A unicode string of the IRI
    """

    if not isinstance(value, byte_cls):
        raise TypeError(unwrap(
            '''
            value must be a byte string, not %s
            ''',
            type_name(value)
        ))

    parsed = urlsplit(value)

    scheme = parsed.scheme
    if scheme is not None:
        scheme = scheme.decode('ascii')

    # ":" and "@" delimit the userinfo, so decoded occurrences stay quoted
    username = _urlunquote(parsed.username, remap=[':', '@'])
    password = _urlunquote(parsed.password, remap=[':', '@'])

    hostname = parsed.hostname
    if hostname:
        hostname = hostname.decode('idna')
        # The hostname attribute strips the square brackets from an IP
        # literal. A registered name can never contain ":" (it would have
        # been split off as the port), so a colon means the brackets must be
        # restored for the netloc to remain parseable.
        if ':' in hostname:
            hostname = '[' + hostname + ']'

    port = parsed.port
    if port and not isinstance(port, int_types):
        port = port.decode('ascii')

    netloc = ''
    if username is not None:
        netloc += username
        if password:
            netloc += ':' + password
        netloc += '@'
    if hostname is not None:
        netloc += hostname
    if port is not None:
        netloc += ':' + str_cls(port)

    # Literal separators are kept as-is; percent-encoded ones stay encoded
    path = _urlunquote(parsed.path, remap=['/'], preserve=True)
    query = _urlunquote(parsed.query, remap=['&', '='], preserve=True)
    fragment = _urlunquote(parsed.fragment)

    return urlunsplit((scheme, netloc, path, query, fragment))
def _iri_utf8_errors_handler(exc):
    """
    Codec error handler used when decoding the UTF-8 parts of a URI into an
    IRI. The bytes that could not be decoded are emitted in %XX form (with
    lowercase hex digits) as part of the resulting unicode string.

    :param exc:
        The UnicodeDecodeError exception

    :return:
        A 2-element tuple of (replacement unicode string, integer index to
        resume at)
    """

    undecodable = exc.object[exc.start:exc.end]
    escaped = ''.join('%%%02x' % octet for octet in bytes_to_list(undecodable))
    # Decoding resumes right after the span reported by the exception
    return (escaped, exc.end)


# Makes the handler available as the "iriutf8" errors mode for .decode()
codecs.register_error('iriutf8', _iri_utf8_errors_handler)
def _urlquote(string, safe=''):
    """
    Quotes a unicode string for use in a URL

    Percent escapes already present in the string are normalized first: runs
    that decode to valid UTF-8 are unquoted (and then re-quoted as needed),
    while escapes that must stay (invalid UTF-8 bytes, or encoded versions of
    characters in safe) are kept verbatim and never double quoted.

    :param string:
        A unicode string

    :param safe:
        A unicode string of character to not encode

    :return:
        None (if string is None or empty) or an ASCII byte string of the
        quoted string
    """

    if string is None or string == '':
        return None

    # Anything already hex quoted is pulled out of the URL and unquoted if
    # possible
    if re.search('%[0-9a-fA-F]{2}', string):
        # Try to unquote any percent values, restoring them if they are not
        # valid UTF-8. Also, requote any safe chars since encoded versions of
        # those are functionally different than the unquoted ones.
        def _try_unescape(match):
            byte_string = unquote_to_bytes(match.group(0))
            unicode_string = byte_string.decode('utf-8', 'iriutf8')
            for safe_char in list(safe):
                unicode_string = unicode_string.replace(safe_char, '%%%02x' % ord(safe_char))
            return unicode_string
        string = re.sub('(?:%[0-9a-fA-F]{2})+', _try_unescape, string)

    # Once we have the minimal set of hex quoted values, quote only the text
    # between them so that they are not double quoted. Splitting on a
    # capturing group puts the escapes at the odd indexes. Working segment by
    # segment avoids a placeholder character, which could collide with the
    # same character legitimately occurring in the input (e.g. a NUL produced
    # by unquoting "%00").
    quoted_safe = safe.encode('utf-8')
    pieces = []
    for index, piece in enumerate(re.split('(%[0-9a-fA-F]{2})', string)):
        if index % 2:
            pieces.append(piece.encode('ascii'))
        elif piece:
            quoted = urlquote(piece.encode('utf-8'), safe=quoted_safe)
            if not isinstance(quoted, byte_cls):
                quoted = quoted.encode('ascii')
            pieces.append(quoted)

    return b''.join(pieces)
def _urlunquote(byte_string, remap=None, preserve=None): """ Unquotes a URI portion from a byte string into unicode using UTF-8
:param byte_string: A byte string of the data to unquote
:param remap: A list of characters (as unicode) that should be re-mapped to a %XX encoding. This is used when characters are not valid in part of a URL.
:param preserve: A bool - indicates that the chars to be remapped if they occur in non-hex form, should be preserved. E.g. / for URL path.
:return: A unicode string """
if byte_string is None: return byte_string
if byte_string == b'': return ''
if preserve: replacements = ['\x1A', '\x1C', '\x1D', '\x1E', '\x1F'] preserve_unmap = {} for char in remap: replacement = replacements.pop(0) preserve_unmap[replacement] = char byte_string = byte_string.replace(char.encode('ascii'), replacement.encode('ascii'))
byte_string = unquote_to_bytes(byte_string)
if remap: for char in remap: byte_string = byte_string.replace(char.encode('ascii'), ('%%%02x' % ord(char)).encode('ascii'))
output = byte_string.decode('utf-8', 'iriutf8')
if preserve: for replacement, original in preserve_unmap.items(): output = output.replace(replacement, original)
return output
|