#line 79 "shiftjis.ipk"
"""Shift JIS to Unicode / UTF-8 conversion driven by a lookup table.

The table ``tou`` is loaded at import time from ``shiftjis.dat`` and is
laid out as three consecutive regions:

* two-byte codes with lead byte 0x81..0x9F (``kanji1offset``),
* single-byte codes 0xA0..0xDF (``kanaoffset``),
* two-byte codes with lead byte 0xE0..0xFC (``kanji2offset``).

Code points that cannot be mapped are reported as 0xFFFF.
"""
from array import array
from interscript.encoding.utf8 import utf8

# Number of table slots per lead byte: trail bytes 0x40..0xFC inclusive.
width = 0xFD - 0x40
# Number of single-byte slots: 0xA0..0xDF inclusive.
kanalen = 0xE0 - 0xA0
kanji1len = width * (0xA0-0x81)
kanji2len = width * (0xFD-0xE0)
sjsize = kanji1len + kanalen + kanji2len
kanji1offset = 0
kanaoffset = kanji1len
kanji2offset = kanaoffset + kanalen

# Table of unsigned 16-bit values indexed as described in the module docstring.
tou = array('H')
# NOTE: relative path, so it is resolved against the current working directory.
filename = 'interscript/encoding/shiftjis.dat'
f = open(filename,'rb')
try:
    # fromfile raises EOFError if the file holds fewer than sjsize items;
    # the finally clause makes sure the handle is closed in that case too.
    tou.fromfile(f,sjsize)
finally:
    f.close()
del f


def is_lead_byte(ch):
    """Return true if the integer ``ch`` is the first byte of a two-byte code.

    The accepted ranges are 0x81..0x9F and 0xE0..0xFC, the same ranges
    ``shiftjis_to_unicode`` uses to select a two-byte table region.
    """
    return 0x81 <= ch <= 0x9F or 0xE0 <= ch <= 0xFC


def is_trail_byte(ch, lead=None):
    """Return true if the integer ``ch`` can be the second byte of a code.

    ``ch`` must lie in 0x40..0xFC and must not be 0x7F, which Shift JIS
    does not use as a trail byte.  When ``lead`` is given, it must also be
    a valid lead byte; when it is omitted only ``ch`` is checked.
    """
    if lead is not None and not is_lead_byte(lead):
        return False
    return 0x40 <= ch <= 0xFC and ch != 0x7F


def shiftjis_to_unicode(ch):
    """Map one Shift JIS code to a Unicode code point.

    ``ch`` is an integer: a single byte value (0x00..0xFF) or a two-byte
    code packed as ``lead << 8 | trail``.

    Returns the byte value itself for 0x00..0x7F, the table entry for
    single-byte codes 0xA0..0xDF and for two-byte codes, and 0xFFFF for
    anything that does not fall into one of the table regions.
    """
    hi = ch >> 8
    lo = ch & 0xFF
    if hi == 0:
        if lo <= 0x7F:
            return lo
        if 0xA0 <= lo <= 0xDF:
            return tou[lo - 0xA0 + kanaoffset]
        return 0xFFFF
    # A trail byte outside 0x40..0xFC would index a neighbouring row (or
    # wrap around through a negative index), so reject it up front.
    if not 0x40 <= lo <= 0xFC:
        return 0xFFFF
    if 0x81 <= hi <= 0x9F:
        return tou[(hi - 0x81) * width + lo - 0x40 + kanji1offset]
    if 0xE0 <= hi <= 0xFC:
        return tou[(hi - 0xE0) * width + lo - 0x40 + kanji2offset]
    return 0xFFFF


def _byte_value(item):
    """Return the integer value of one element of a byte string."""
    # Elements that are already integers are passed through unchanged;
    # one-character strings are converted with ord().
    if isinstance(item, int):
        return item
    return ord(item)


def shiftjis_to_utf8(s):
    """Convert the Shift JIS encoded byte string ``s`` to UTF-8.

    Each single-byte or two-byte code is mapped with
    ``shiftjis_to_unicode`` and encoded with ``utf8``; the pieces are
    concatenated and returned.  An empty input yields an empty string.
    A lead byte at the very end of ``s`` with no trail byte following it
    is dropped.
    """
    pieces = []
    i = 0
    n = len(s)
    while i < n:
        ch = _byte_value(s[i])
        i = i + 1
        if is_lead_byte(ch):
            if i == n:
                # Truncated two-byte code: nothing sensible to emit.
                break
            ch = ch << 8 | _byte_value(s[i])
            i = i + 1
        pieces.append(utf8(shiftjis_to_unicode(ch)))
    return ''.join(pieces)