# This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)

import codecs
import re
import stringprep
from unicodedata import ucd_3_2_0 as unicodedata

# IDNA section 3.1
dots = re.compile("[\u002E\u3002\uFF0E\uFF61]")

# IDNA section 5
ace_prefix = b"xn--"
sace_prefix = "xn--"


# This assumes query strings, so AllowUnassigned is true
def nameprep(label):
    """Apply the Nameprep profile (RFC 3491) to a single str label.

    The label is mapped (table B.1 characters dropped, table B.2 mapping
    applied), normalized with NFKC using the Unicode 3.2.0 database,
    checked against the prohibited-character tables, and finally checked
    against the bidi requirements.

    Returns the prepared label as a str.
    Raises UnicodeError on a prohibited character or a bidi violation.
    """
    # Map
    newlabel = []
    for c in label:
        if stringprep.in_table_b1(c):
            # Map to nothing
            continue
        newlabel.append(stringprep.map_table_b2(c))
    label = "".join(newlabel)

    # Normalize
    label = unicodedata.normalize("NFKC", label)

    # Prohibit
    for c in label:
        if stringprep.in_table_c12(c) or \
           stringprep.in_table_c22(c) or \
           stringprep.in_table_c3(c) or \
           stringprep.in_table_c4(c) or \
           stringprep.in_table_c5(c) or \
           stringprep.in_table_c6(c) or \
           stringprep.in_table_c7(c) or \
           stringprep.in_table_c8(c) or \
           stringprep.in_table_c9(c):
            raise UnicodeError("Invalid character %r" % c)

    # Check bidi
    RandAL = [stringprep.in_table_d1(x) for x in label]
    # The tests below do not depend on *which* RandAL character was found,
    # so run them once per label instead of once per RandAL character
    # (repeating them rescans the whole label each time: quadratic work).
    if any(RandAL):
        # There is a RandAL char in the string. Must perform further
        # tests:
        # 1) The characters in section 5.8 MUST be prohibited.
        #    This is table C.8, which was already checked
        # 2) If a string contains any RandALCat character, the string
        #    MUST NOT contain any LCat character.
        if any(stringprep.in_table_d2(x) for x in label):
            raise UnicodeError("Violation of BIDI requirement 2")

        # 3) If a string contains any RandALCat character, a
        #    RandALCat character MUST be the first character of the
        #    string, and a RandALCat character MUST be the last
        #    character of the string.
        if not RandAL[0] or not RandAL[-1]:
            raise UnicodeError("Violation of BIDI requirement 3")

    return label


def ToASCII(label):
    """Convert one str label to its ASCII (ACE) form, returned as bytes.

    A label that is already ASCII is returned encoded as-is.  Otherwise it
    is nameprepped and, if still not ASCII, punycode-encoded and prefixed
    with the ACE prefix.

    Raises UnicodeError if the result is empty or 64 bytes or longer, if
    nameprep rejects the label, or if a non-ASCII label already starts
    with the ACE prefix.
    """
    try:
        # Step 1: try ASCII
        label = label.encode("ascii")
    except UnicodeError:
        pass
    else:
        # Skip to step 3: UseSTD3ASCIIRules is false, so
        # Skip to step 8.
        if 0 < len(label) < 64:
            return label
        raise UnicodeError("label empty or too long")

    # Step 2: nameprep
    label = nameprep(label)

    # Step 3: UseSTD3ASCIIRules is false
    # Step 4: try ASCII
    try:
        label = label.encode("ascii")
    except UnicodeError:
        pass
    else:
        # Skip to step 8.
        if 0 < len(label) < 64:
            return label
        raise UnicodeError("label empty or too long")

    # Step 5: Check ACE prefix
    if label.startswith(sace_prefix):
        raise UnicodeError("Label starts with ACE prefix")

    # Step 6: Encode with PUNYCODE
    label = label.encode("punycode")

    # Step 7: Prepend ACE prefix
    label = ace_prefix + label

    # Step 8: Check size
    if 0 < len(label) < 64:
        return label
    raise UnicodeError("label empty or too long")


def ToUnicode(label):
    """Convert one label (bytes or str) to its Unicode form, returned as str.

    A label without the ACE prefix is returned as its ASCII text.  A label
    with the prefix is punycode-decoded, and the decoded text is re-encoded
    with ToASCII to verify that it round-trips to the input.

    Raises UnicodeError if the label is not ASCII after nameprep or if the
    round-trip comparison fails.
    """
    # Step 1: Check for ASCII
    if isinstance(label, bytes):
        pure_ascii = True
    else:
        try:
            label = label.encode("ascii")
            pure_ascii = True
        except UnicodeError:
            pure_ascii = False
    if not pure_ascii:
        # Step 2: Perform nameprep
        label = nameprep(label)
        # It doesn't say this, but apparently, it should be ASCII now
        try:
            label = label.encode("ascii")
        except UnicodeError:
            raise UnicodeError("Invalid character in IDN label")
    # Step 3: Check for ACE prefix
    if not label.startswith(ace_prefix):
        return str(label, "ascii")

    # Step 4: Remove ACE prefix
    label1 = label[len(ace_prefix):]

    # Step 5: Decode using PUNYCODE
    result = label1.decode("punycode")

    # Step 6: Apply ToASCII
    label2 = ToASCII(result)

    # Step 7: Compare the result of step 6 with the one of step 3
    # label2 will already be in lower case.
    if str(label, "ascii").lower() != str(label2, "ascii"):
        raise UnicodeError("IDNA does not round-trip", label, label2)

    # Step 8: return the result of step 5
    return result


### Codec APIs

class Codec(codecs.Codec):
    """Stateless IDNA codec operating on whole, dot-separated names."""

    def encode(self, input, errors='strict'):
        """Encode a str name to bytes; return (encoded bytes, len(input)).

        Only errors='strict' is accepted.  A trailing separator in the
        input is kept as a trailing b'.' in the output.
        Raises UnicodeError for any other error mode or an invalid label.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return b'', 0

        try:
            result = input.encode('ascii')
        except UnicodeEncodeError:
            pass
        else:
            # ASCII name: fast path
            labels = result.split(b'.')
            for label in labels[:-1]:
                if not (0 < len(label) < 64):
                    raise UnicodeError("label empty or too long")
            # The last label may be empty (trailing dot), so only its
            # upper bound is checked.
            if len(labels[-1]) >= 64:
                raise UnicodeError("label too long")
            return result, len(input)

        result = bytearray()
        labels = dots.split(input)
        if labels and not labels[-1]:
            trailing_dot = b'.'
            del labels[-1]
        else:
            trailing_dot = b''
        for label in labels:
            if result:
                # Join with U+002E
                result.extend(b'.')
            result.extend(ToASCII(label))
        return bytes(result+trailing_dot), len(input)

    def decode(self, input, errors='strict'):
        """Decode a bytes name to str; return (decoded str, len(input)).

        Only errors='strict' is accepted.  A trailing b'.' in the input is
        kept as a trailing '.' in the output.
        Raises UnicodeError for any other error mode or an invalid label.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return "", 0

        # IDNA allows decoding to operate on Unicode strings, too.
        if not isinstance(input, bytes):
            # XXX obviously wrong, see #3232
            input = bytes(input)

        if ace_prefix not in input:
            # Fast path
            try:
                return input.decode('ascii'), len(input)
            except UnicodeDecodeError:
                pass

        labels = input.split(b".")

        if labels and len(labels[-1]) == 0:
            trailing_dot = '.'
            del labels[-1]
        else:
            trailing_dot = ''

        result = [ToUnicode(label) for label in labels]

        return ".".join(result)+trailing_dot, len(input)


class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
    """Incremental IDNA encoder that only emits completed labels."""

    def _buffer_encode(self, input, errors, final):
        """Encode the complete labels in input.

        Returns (encoded bytes, number of input characters consumed).
        Unless final is true, the last label is treated as possibly
        unfinished and is left unconsumed.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return (b'', 0)

        labels = dots.split(input)
        trailing_dot = b''
        if labels:
            if not labels[-1]:
                trailing_dot = b'.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = b'.'

        result = bytearray()
        size = 0
        for label in labels:
            if size:
                # Join with U+002E
                result.extend(b'.')
                size += 1
            result.extend(ToASCII(label))
            # size counts consumed *input* characters, not output bytes
            size += len(label)

        result += trailing_dot
        size += len(trailing_dot)
        return (bytes(result), size)


class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental IDNA decoder that only emits completed labels."""

    def _buffer_decode(self, input, errors, final):
        """Decode the complete labels in input.

        Returns (decoded str, number of input items consumed).  Unless
        final is true, the last label is treated as possibly unfinished
        and is left unconsumed.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return ("", 0)

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, str):
            labels = dots.split(input)
        else:
            # Must be ASCII string
            input = str(input, "ascii")
            labels = input.split(".")

        trailing_dot = ''
        if labels:
            if not labels[-1]:
                trailing_dot = '.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = '.'

        result = []
        size = 0
        for label in labels:
            result.append(ToUnicode(label))
            if size:
                size += 1
            size += len(label)

        result = ".".join(result) + trailing_dot
        size += len(trailing_dot)
        return (result, size)


class StreamWriter(Codec, codecs.StreamWriter):
    pass


class StreamReader(Codec, codecs.StreamReader):
    pass


### encodings module API

def getregentry():
    """Return the codecs.CodecInfo describing the 'idna' codec."""
    return codecs.CodecInfo(
        name='idna',
        encode=Codec().encode,
        decode=Codec().decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
    )
Name | Type | Size | Permission | Actions |
---|---|---|---|---|
__pycache__ | Folder | 0755 |
|
|
__init__.py | File | 5.46 KB | 0644 |
|
aliases.py | File | 15.33 KB | 0644 |
|
ascii.py | File | 1.22 KB | 0644 |
|
base64_codec.py | File | 1.5 KB | 0644 |
|
big5.py | File | 1019 B | 0644 |
|
big5hkscs.py | File | 1.01 KB | 0644 |
|
bz2_codec.py | File | 2.2 KB | 0644 |
|
charmap.py | File | 2.04 KB | 0644 |
|
cp037.py | File | 12.81 KB | 0644 |
|
cp1006.py | File | 13.25 KB | 0644 |
|
cp1026.py | File | 12.81 KB | 0644 |
|
cp1125.py | File | 33.79 KB | 0644 |
|
cp1140.py | File | 12.8 KB | 0644 |
|
cp1250.py | File | 13.37 KB | 0644 |
|
cp1251.py | File | 13.05 KB | 0644 |
|
cp1252.py | File | 13.19 KB | 0644 |
|
cp1253.py | File | 12.79 KB | 0644 |
|
cp1254.py | File | 13.19 KB | 0644 |
|
cp1255.py | File | 12.17 KB | 0644 |
|
cp1256.py | File | 12.51 KB | 0644 |
|
cp1257.py | File | 13.06 KB | 0644 |
|
cp1258.py | File | 13.05 KB | 0644 |
|
cp273.py | File | 13.8 KB | 0644 |
|
cp424.py | File | 11.77 KB | 0644 |
|
cp437.py | File | 33.75 KB | 0644 |
|
cp500.py | File | 12.81 KB | 0644 |
|
cp720.py | File | 13.37 KB | 0644 |
|
cp737.py | File | 33.87 KB | 0644 |
|
cp775.py | File | 33.67 KB | 0644 |
|
cp850.py | File | 33.31 KB | 0644 |
|
cp852.py | File | 34.18 KB | 0644 |
|
cp855.py | File | 33.06 KB | 0644 |
|
cp856.py | File | 12.13 KB | 0644 |
|
cp857.py | File | 33.11 KB | 0644 |
|
cp858.py | File | 33.22 KB | 0644 |
|
cp860.py | File | 33.87 KB | 0644 |
|
cp861.py | File | 33.82 KB | 0644 |
|
cp862.py | File | 32.59 KB | 0644 |
|
cp863.py | File | 33.45 KB | 0644 |
|
cp864.py | File | 32.87 KB | 0644 |
|
cp865.py | File | 33.81 KB | 0644 |
|
cp866.py | File | 33.59 KB | 0644 |
|
cp869.py | File | 32.19 KB | 0644 |
|
cp874.py | File | 12.3 KB | 0644 |
|
cp875.py | File | 12.55 KB | 0644 |
|
cp932.py | File | 1023 B | 0644 |
|
cp949.py | File | 1023 B | 0644 |
|
cp950.py | File | 1023 B | 0644 |
|
euc_jis_2004.py | File | 1.03 KB | 0644 |
|
euc_jisx0213.py | File | 1.03 KB | 0644 |
|
euc_jp.py | File | 1 KB | 0644 |
|
euc_kr.py | File | 1 KB | 0644 |
|
gb18030.py | File | 1.01 KB | 0644 |
|
gb2312.py | File | 1 KB | 0644 |
|
gbk.py | File | 1015 B | 0644 |
|
hex_codec.py | File | 1.47 KB | 0644 |
|
hp_roman8.py | File | 13.16 KB | 0644 |
|
hz.py | File | 1011 B | 0644 |
|
idna.py | File | 8.96 KB | 0644 |
|
iso2022_jp.py | File | 1.03 KB | 0644 |
|
iso2022_jp_1.py | File | 1.04 KB | 0644 |
|
iso2022_jp_2.py | File | 1.04 KB | 0644 |
|
iso2022_jp_2004.py | File | 1.05 KB | 0644 |
|
iso2022_jp_3.py | File | 1.04 KB | 0644 |
|
iso2022_jp_ext.py | File | 1.04 KB | 0644 |
|
iso2022_kr.py | File | 1.03 KB | 0644 |
|
iso8859_1.py | File | 12.87 KB | 0644 |
|
iso8859_10.py | File | 13.27 KB | 0644 |
|
iso8859_11.py | File | 12.05 KB | 0644 |
|
iso8859_13.py | File | 12.96 KB | 0644 |
|
iso8859_14.py | File | 13.33 KB | 0644 |
|
iso8859_15.py | File | 12.9 KB | 0644 |
|
iso8859_16.py | File | 13.24 KB | 0644 |
|
iso8859_2.py | File | 13.09 KB | 0644 |
|
iso8859_3.py | File | 12.78 KB | 0644 |
|
iso8859_4.py | File | 13.06 KB | 0644 |
|
iso8859_5.py | File | 12.71 KB | 0644 |
|
iso8859_6.py | File | 10.58 KB | 0644 |
|
iso8859_7.py | File | 12.54 KB | 0644 |
|
iso8859_8.py | File | 10.78 KB | 0644 |
|
iso8859_9.py | File | 12.85 KB | 0644 |
|
johab.py | File | 1023 B | 0644 |
|
koi8_r.py | File | 13.46 KB | 0644 |
|
koi8_t.py | File | 12.88 KB | 0644 |
|
koi8_u.py | File | 13.44 KB | 0644 |
|
kz1048.py | File | 13.4 KB | 0644 |
|
latin_1.py | File | 1.23 KB | 0644 |
|
mac_arabic.py | File | 35.61 KB | 0644 |
|
mac_centeuro.py | File | 13.77 KB | 0644 |
|
mac_croatian.py | File | 13.31 KB | 0644 |
|
mac_cyrillic.py | File | 13.14 KB | 0644 |
|
mac_farsi.py | File | 14.81 KB | 0644 |
|
mac_greek.py | File | 13.4 KB | 0644 |
|
mac_iceland.py | File | 13.18 KB | 0644 |
|
mac_latin2.py | File | 13.79 KB | 0644 |
|
mac_roman.py | File | 13.16 KB | 0644 |
|
mac_romanian.py | File | 13.34 KB | 0644 |
|
mac_turkish.py | File | 13.2 KB | 0644 |
|
mbcs.py | File | 1.18 KB | 0644 |
|
oem.py | File | 1019 B | 0644 |
|
palmos.py | File | 13.2 KB | 0644 |
|
ptcp154.py | File | 13.69 KB | 0644 |
|
punycode.py | File | 6.72 KB | 0644 |
|
quopri_codec.py | File | 1.49 KB | 0644 |
|
raw_unicode_escape.py | File | 1.18 KB | 0644 |
|
rot_13.py | File | 2.4 KB | 0755 |
|
shift_jis.py | File | 1.01 KB | 0644 |
|
shift_jis_2004.py | File | 1.03 KB | 0644 |
|
shift_jisx0213.py | File | 1.03 KB | 0644 |
|
tis_620.py | File | 12.01 KB | 0644 |
|
undefined.py | File | 1.27 KB | 0644 |
|
unicode_escape.py | File | 1.16 KB | 0644 |
|
utf_16.py | File | 5.11 KB | 0644 |
|
utf_16_be.py | File | 1.01 KB | 0644 |
|
utf_16_le.py | File | 1.01 KB | 0644 |
|
utf_32.py | File | 5.01 KB | 0644 |
|
utf_32_be.py | File | 930 B | 0644 |
|
utf_32_le.py | File | 930 B | 0644 |
|
utf_7.py | File | 946 B | 0644 |
|
utf_8.py | File | 1005 B | 0644 |
|
utf_8_sig.py | File | 4.04 KB | 0644 |
|
uu_codec.py | File | 2.78 KB | 0644 |
|
zlib_codec.py | File | 2.15 KB | 0644 |
|