6.5.6. UTF-16

UTF-16 is a hybrid 2/4-byte encoding of ISO-10646. It is compatible with UCS-2 (Unicode), except that a 16-bit word x in the range D800-DBFF introduces a two-word encoding, the second word y being in the range DC00-DFFF. These ranges are reserved in ISO-10646 for UTF-16. The encoded value (all numbers hexadecimal) is 10000 + (x - D800) * 400 + (y - DC00).
Start python section to interscript/encoding/utf16.py[1 /1 ]
     1: #line 302 "utf8.ipk"
     2: import string
     3: 
     def utf16(i):
       """Encode one ISO-10646 code point as big-endian UTF-16.

       The result is a string of 2 or 4 characters, each character holding
       one byte value (0-255), high byte first.  Code points below 0x10000
       become a single 16-bit word; all others become a surrogate pair
       (a word in D800-DBFF followed by a word in DC00-DFFF).

       Raises ValueError if i is negative or above 0x10FFFF, the largest
       value a surrogate pair can represent.
       """
       if i < 0 or i > 0x10FFFF:
         raise ValueError("code point out of UTF-16 range: %r" % (i,))
       if i < 0x10000:
         return chr(i >> 8) + chr(i & 0xff)
       # Split the 20-bit offset above 0x10000 into two 10-bit halves.
       offset = i - 0x10000
       w1 = 0xD800 + (offset >> 10)
       w2 = 0xDC00 + (offset & 0x3FF)
       return (chr(w1 >> 8) + chr(w1 & 0xff) +
               chr(w2 >> 8) + chr(w2 & 0xff))
    13: 
    def seq_to_utf16(a):
      """Encode a sequence of code points as one UTF-16 string.

      Each element of a is passed to utf16() and the pieces are
      concatenated in order.  An empty sequence gives ''.
      """
      # join builds the result in one pass instead of re-copying the
      # accumulated string on every iteration.
      return ''.join([utf16(ch) for ch in a])
    18: 
    19: # decoding
    def parse_utf16(s, i):
      """Decode one code point from big-endian UTF-16 data at offset i.

      s may be a bytes-like object (items are ints) or a string holding
      one byte value per character.

      Returns a tuple (code_point, n) where n is the number of bytes
      consumed: 2 for a single word, 4 for a surrogate pair.  A word in
      DC00-DFFF that is not preceded by a lead word is returned unchanged.

      Raises IndexError if the data ends inside the character, and
      ValueError if a lead word (D800-DBFF) is not followed by a trail
      word (DC00-DFFF).
      """
      def byte_at(k):
        b = s[k]
        if isinstance(b, str):
          return ord(b)
        return b

      # Parentheses matter: '+' binds tighter than '<<'.
      w1 = (byte_at(i) << 8) + byte_at(i + 1)
      if w1 < 0xD800 or w1 > 0xDBFF:
        return w1, 2
      w2 = (byte_at(i + 2) << 8) + byte_at(i + 3)
      if w2 < 0xDC00 or w2 > 0xDFFF:
        raise ValueError("invalid UTF-16 trail word 0x%04X at offset %d" % (w2, i + 2))
      return 0x10000 + ((w1 - 0xD800) << 10) + (w2 - 0xDC00), 4
    27: 
    def utf16_to_array(s):
      """Decode UTF-16 data into an array of code points.

      Walks s from offset 0 to len(s), decoding one character at a time
      with parse_utf16 and appending each code point to the result.

      Returns an array.array with typecode 'L'; 'H' (16 bits) cannot hold
      the values above 0xFFFF that surrogate pairs decode to.
      """
      from array import array
      a = array('L')
      n = len(s)
      i = 0
      while i < n:
        # parse_utf16 returns (code point, bytes consumed), so the byte
        # count is added to the offset rather than assigned to it.
        ch, used = parse_utf16(s, i)
        a.append(ch)
        i = i + used
      return a
    39: 
    def utf16_to_utf8(s):
      """Convert UTF-16 data to UTF-8.

      Decodes s with utf16_to_array and hands the result to seq_to_utf8,
      returning whatever seq_to_utf8 returns.
      """
      # NOTE(review): seq_to_utf8 is neither defined nor imported in the
      # visible part of this file -- confirm it is in scope.
      code_points = utf16_to_array(s)
      return seq_to_utf8(code_points)
    42: 
    43: 
End python section to interscript/encoding/utf16.py[1]