# Originally written on 2017-04-26 19:03.

# Welcome to my fancy Shift JIS explanation and simple implementation in Python.
#
# If you haven't heard of Shift JIS, it's a single or double-byte encoding for
# the JIS X 0201 and JIS X 0208 character sets, a bit like UTF-8 is for Unicode.
#
# The key feature of Shift JIS is being able to use both character sets at once,
# retaining legacy half-width ASCII and Katakana characters from JIS X 0201.
#
# First we're going to implement the JIS X 0201 standard. ASCII[1] only uses 7
# bits, meaning only 128 characters are defined. JIS X 0201 uses 8 bits like
# most ASCII variants, meaning it has space for 256 instead.[2] ASCII variants
# are generally split into two halves: The 'lower' half and 'upper' half.
#
# For JIS X 0201, the lower half is a modified ASCII layout. It contains control
# characters, which are used to change the text layout and make your terminal go
# beep, and printable characters, which you're reading right now.
#
# The upper half (which gets its name by having its first of eight bits set)
# contains some symbols and Katakana, 63 in total of the available 128 spaces.
# JIS X 0211 adds more control characters[3] but we won't be implementing that.
#
# [1]: https://en.wikipedia.org/wiki/ASCII
# [2]: https://en.wikipedia.org/wiki/JIS_X_0201
# [3]: https://en.wikipedia.org/wiki/JIS_X_0211

# Code time! For simplicity's sake (it'll become important later), characters
# will be specified in hexadecimal. This is useful since you can line them up
# with the Wikipedia graphs but also instantly tell if a number fits in to a
# single byte since a single byte is two hexadecimal digits long. I'll also do
# my best to make any range checks look like a number line. Error checking is
# important, which is why this code throws a lot of exceptions if something
# exceptional happens, as is the Python way.
#
# In addition, most of the functions (if not all) are going to be tested using
# exhaustive test functions.
# The idea is to knowingly generate some invalid
# inputs and some valid inputs then compare to see if our functions can tell the
# difference properly. In a lot of cases this will be done just by quickly
# looping through all possible characters in a search space, as well as invalid
# ones to verify that those are picked up too. When the tests fail, they'll just
# print out some variables (see the source code to decipher them.)
#
# Overall, this source file is going to be divided on a per-codec basis, so
# let's get started with JIS X 0201.

# You'll notice there's no jisx0201_encode or decode functions. Since JIS X 0201
# can fit in a byte, there's no need for any. It maps to itself nicely.

def jisx0201_valid(char):
    """Return True if `char` is an allocated JIS X 0201 code.

    The 7-bit lower half (0x00 - 0x7F) and the symbols/Katakana of the upper
    half (0xA1 - 0xDF) are accepted; every other integer is rejected.
    """
    lower_half = (0x00 <= char <= 0x7F)
    upper_half = (0xA1 <= char <= 0xDF)
    return (lower_half or upper_half)


def test_jisx0201_valid():
    """Compare jisx0201_valid against Python's codec for -0x100 .. 0x100.

    Returns True when both agree on every value, otherwise prints the first
    disagreement (our verdict, the value, Python's verdict) and returns False.
    """
    for char in range(-0x100, 0x100 + 1):
        valid = jisx0201_valid(char)
        try:
            # This encoding should always be correct, so decode errors are just
            # about invalid characters and not invalid encodings.
            bytes([char]).decode("shiftjis")
            real_valid = True
        except ValueError:
            # Converting to either bytes or Unicode failed, either way it's
            # invalid. (UnicodeDecodeError is a kind of ValueError.)
            real_valid = False
        if valid != real_valid:
            print("%s %x %s" % (valid, char, real_valid))
            return False
    return True

print("test_jisx0201_valid: %s" % (test_jisx0201_valid()))

# JIS X 0208 requires us to handle double bytes, so let's set up some functions
# to be able to convert numbers to and from tuples of bytes like (b1, b2).

def byte_valid(byte):
    """Return True if `byte` fits in a single unsigned byte (0x00 - 0xFF)."""
    return (0x00 <= byte <= 0xFF)


def test_byte_valid():
    """Check byte_valid in both directions for -0x100 .. 0x100.

    Returns True on success; prints the offending value and returns False if an
    out-of-range value is accepted or an in-range value is rejected.
    """
    for b in range(-0x100, 0x100 + 1):
        out_of_range = (b < 0x00 or 0xFF < b)
        if byte_valid(b) == out_of_range:
            print("%x" % (b))
            return False
    return True

print("test_byte_valid: %s" % (test_byte_valid()))

# Now we need some functions to pack and unpack two bytes. In hexadecimal, a
# byte is two digits, so this code is equivalent to the decimal version of
# multiplying the first two digits by 100 then adding the second pair back.

def db_pack(b1, b2):
    """Pack two bytes into one number, `b1` being the high byte.

    Raises ValueError if either argument is outside 0x00 - 0xFF.
    """
    if not (byte_valid(b1) and byte_valid(b2)):
        raise ValueError("db_pack: Invalid byte(s) %x %x" % (b1, b2))
    return ((b1 * 0x100) + b2)


def db_unpack(bytes):
    """Split a packed double byte back into a (high, low) tuple.

    The parameter keeps its original name `bytes` (shadowing the builtin inside
    this function only) so keyword callers keep working.

    Raises ValueError if the value is negative or larger than 0xFFFF.
    """
    if bytes < 0x0000 or 0xFFFF < bytes:
        raise ValueError("db_unpack: 'bytes' does not fit in two bytes %x" % (bytes))
    # To unpack bytes divide by 0x100 to get the first two digits, then take
    # the remainder to get the last two digits.
    return divmod(bytes, 0x100)


def test_dbpacking():
    """Round-trip every pair of values in -0x10 .. 0x110 through pack/unpack.

    Returns True when valid pairs survive the round trip and invalid pairs are
    rejected with ValueError; otherwise prints the pair and returns False.
    """
    for b1 in range(-0x10, 0x110 + 1):
        for b2 in range(-0x10, 0x110 + 1):
            valid = (byte_valid(b1) and byte_valid(b2))
            try:
                packed = db_pack(b1, b2)
                (b3, b4) = db_unpack(packed)
                if b1 != b3 or b2 != b4:
                    print("%x-%x %x %x-%x" % (b1, b2, packed, b3, b4))
                    return False
                real_valid = True
            except ValueError:
                real_valid = False
            if valid != real_valid:
                print("%x-%x" % (b1, b2))
                return False
    return True

print("test_dbpacking: %s" % (test_dbpacking()))

# So far it's pretty simple. Next up we need to look at implementing some
# functions for JIS X 0208.[4] Unlike Unicode which is one-dimensional with
# sections marked as planes, JIS X 0208 is divided in to a 94x94 grid, with each
# character being put in a particular row in a particular cell.
#
# Why 94? The reason is ISO/IEC 2022.[5] Instead of using 8 bits like JIS X
# 0201, JIS X 0208 and some other standards define their characters by
# fitting their rows and cells in to two bytes compatible with 7-bit printable
# ASCII. Control characters can then be used to switch between different
# encodings in the middle of text. Because of this, a JIS X 0208 character is
# actually two bytes stuck together in the printable ASCII range (0x20 - 0x7E,
# 0x7F is 'delete' for some reason) rather than being a continuous set like
# Unicode.
# This means that the character '0x2121' is actually row 1, cell 1 and
# the character '0x7E7E' is row 94, 94.
#
# It also means that if we were to lay out the characters continuously for every
# number like Unicode does, we'd find that each row (say, 0x21XX) is actually a
# byte long, containing 256 with only 94 used, ignoring 162. This tradeoff is
# made so that you can take any JIS X 0208 character and display it as mangled
# ASCII text rather than random control characters that make your text mess up.
# For example, row 3, cell 16 is '0' (converted to Unicode) or '#0' in mangled
# ASCII which can then be converted back to JIS X 0208.
#
# There's a table of JIS X 0208 characters online[6] which shows all the rows
# and cells. Don't be fooled, the cells don't have their own rows and columns
# (confusingly the Wikipedia page seems to imply this), that's just for display.
# In it you can see the JIS character code and the Shift JIS encoded character,
# and a few interesting issues: The entire character set is wide, not narrow
# like JIS X 0201, and it contains unallocated blocks. The first issue is why
# people switch between JIS X 0201 and JIS X 0208, and the second issue is why a
# lot of systems still use Shift JIS: They use these blocks for themselves.
#
# [4]: https://en.wikipedia.org/wiki/JIS_X_0208
# [5]: https://en.wikipedia.org/wiki/ISO/IEC_2022
# [6]: http://www.asahi-net.or.jp/~AX2S-KMTN/ref/jisx0208.html

# For understanding's sake, we're going to deal with JIS X 0208 by specifying
# rows and cells rather than encoded characters in hexadecimal.

def jisx0208_valid(row, cell):
    """Return True if `row` and `cell` are both inside the 94x94 grid (1 - 94)."""
    valid_row = (1 <= row <= 94)
    valid_cell = (1 <= cell <= 94)
    return (valid_row and valid_cell)


def jisx0208_encode(row, cell):
    """Pack a JIS X 0208 row and cell into its two-byte character code.

    Raises ValueError if the row or cell is outside 1 - 94.
    """
    if not jisx0208_valid(row, cell):
        raise ValueError("jisx0208_encode: invalid character %i,%i" % (row, cell))
    # Add 0x20 to align to ASCII printable characters, then pack it.
    return db_pack(row + 0x20, cell + 0x20)


def jisx0208_decode(character):
    """Unpack a two-byte character code into a (row, cell) tuple.

    No range check is done on the result: callers are expected to pass it to
    jisx0208_valid themselves. Whatever db_unpack raises is passed through.
    """
    (b1, b2) = db_unpack(character)
    return (b1 - 0x20, b2 - 0x20)


def test_jisx0208_codec():
    """Round-trip every row/cell pair in -10 .. 100 through encode/decode.

    Returns True when pairs inside the grid survive the round trip and pairs
    outside it are rejected with ValueError; otherwise prints the pair and
    returns False.
    """
    for row in range(-10, 100 + 1):
        for cell in range(-10, 100 + 1):
            valid = ((1 <= row <= 94) and (1 <= cell <= 94))
            try:
                character = jisx0208_encode(row, cell)
                (row2, cell2) = jisx0208_decode(character)
                if row != row2 or cell != cell2:
                    print("%i,%i %x %i,%i" % (row, cell, character, row2, cell2))
                    return False
                real_valid = True
            except ValueError:
                real_valid = False
            if valid != real_valid:
                print("%i-%i" % (row, cell))
                return False
    return True

print("test_jisx0208_codec: %s" % (test_jisx0208_codec()))

# Ok, now that we can encode both character sets it's time to encode Shift JIS!
# First, let's study the map of Shift JIS.[7] Take a moment to drink it in.
#
# We can see that Shift JIS is a superset of JIS X 0201, rather than 7-bit
# ASCII. The first byte can either be JIS X 0201 or the start of a shifted JIS X
# 0208 character. It gets its name from 'shifting' the first byte of its
# encoding around the upper half of JIS X 0201 in the unallocated 65 characters.
#
# When encoding, the first byte only uses 47 of the 65 characters that could be
# used for encoding, and the second byte maps 188 characters for use. If you
# multiply them out, 47x188 and 94x94 are both 8836, so all together there's
# enough room to store a JIS X 0208 character.
#
# The key to Shift JIS is how it redistributes the bits of a JIS X 0208
# character: You can't fit the row number in to one of the 65 characters, so
# part of the number needs to be stored in the second byte. It does this by
# moving whether the row is odd or even from a bit in the row number to a
# position-based indication in the second byte. This halves the first byte's
# needed characters to 47 and doubles the second byte's to 188.
#
# It's worth noting that unlike ASCII or UTF-8, there's no bit packing or
# masking that allow you to extract the character directly from the bytes.
# Instead, there's some weird offsets that actually affect the code: The first
# byte starts being encoded at 0x81 instead of 0x80. All the other indices start
# at 0, but this one just starts at 1 for some reason. I don't know why.
#
# The second byte is a lot stranger: The section for odd rows starts at 0x40,
# meaning the code now needs to skip the 0x7F 'DEL' control code. It also
# overlaps with JIS X 0211 control codes, meaning it could actually mess up your
# terminal unlike a 7-bit encoding. Moving that section back to 0x20 would solve
# both issues. Additionally, the section for even rows starts straight after at
# 0x9F. Moving that forward to 0xA0 would mean you could determine if the
# row is odd by whether the 8th bit is set instead of checking a range.
#
# All of this hassle gets us an encoding that supports multiple character sets
# without the stateful control codes used in ISO/IEC 2022 and backwards
# compatibility with JIS X 0201 text. As far as I can tell, a much nicer
# encoding (EUC-JP)[8] was available since 1993 but instead Shift JIS was
# standardized by Microsoft in 1997 just to have that backwards compatibility.
# Interestingly enough, Microsoft uses a non-standard extension of Shift JIS.[9]
#
# [7]: https://en.wikipedia.org/wiki/Shift_JIS#Shift_JIS_byte_map
# [8]: https://en.wikipedia.org/wiki/Extended_Unix_Code#EUC-JP
# [9]: https://en.wikipedia.org/wiki/Code_page_943

def shiftjis_encode_jisx0208(character):
    """Convert a packed JIS X 0208 character into a packed Shift JIS double byte.

    Raises ValueError if the character's row or cell is outside 1 - 94.
    """
    (row, cell) = jisx0208_decode(character)
    if not jisx0208_valid(row, cell):
        raise ValueError("shiftjis_encode: invalid character %i,%i" % (row, cell))

    # For the first byte...
    # Add 1 so the encoding starts at 0x81, then shrink the value to fit,
    # losing the odd/even bit.
    b1 = (row + 1) // 2
    # Shift the byte based on whether it can fit behind the JIS X 0201 symbols.
    if b1 < 32:
        b1 += 0x80
    else:
        # Bytes ahead of the symbols are implied to start at 32.
        b1 += 0xE0 - 32

    # For the second byte...
    row_odd = (row % 2 == 1)  # Check if the row is odd.
    offset = 0x40 if row_odd else 0x9F  # Pick the indicative offset.
    # Shift to the appropriate location, cells start at 1.
    b2 = offset + cell - 1
    # For odd rows, shift past the 'DEL' control character at 0x7F.
    if row_odd and 0x7F <= b2:
        b2 += 1
    return db_pack(b1, b2)


def shiftjis_decode_jisx0208(db):
    """Convert a packed Shift JIS double byte into a packed JIS X 0208 character.

    Raises ValueError if the first byte is outside 0x81 - 0x9F and 0xE0 - 0xEF,
    or if the second byte is 0x7F or outside 0x40 - 0xFC.
    """
    (b1, b2) = db_unpack(db)

    # For the first byte...
    if 0x81 <= b1 <= 0x9F:
        # Shift its value from behind JIS X 0201 symbols.
        row = b1 - 0x80
    elif 0xE0 <= b1 <= 0xEF:
        # Shift its value from ahead of the symbols, adding 32 to compensate
        # for values ahead starting from 0. Anything past 0xEF would land on a
        # row above 94, so it's rejected here rather than later.
        row = b1 - 0xE0 + 32
    else:
        raise ValueError("shiftjis_decode_jisx0208: invalid first byte %x" % (b1))
    # Expand the row back to an even number. While dividing by 2 would floor the
    # value meaning the odd number corresponding to this cell would be next,
    # adding 1 so the encoding starts at 0x81 means it actually gets rounded up
    # to the ceiling, making the odd number below this number.
    row = row * 2

    # For the second byte...
    if b2 == 0x7F:
        # The 'DEL' control character is skipped by the encoder.
        raise ValueError("shiftjis_decode_jisx0208: invalid second byte %x" % (b2))
    if 0x40 <= b2 <= 0x9E:
        # Odd row! Set to the corresponding odd number.
        row = row - 1
        cell = b2 - 0x40
        # Compensate for shifting past the 'DEL' control character.
        if 0x7F < b2:
            cell = cell - 1
    elif 0x9F <= b2 <= 0xFC:
        cell = b2 - 0x9F
    else:
        raise ValueError("shiftjis_decode_jisx0208: invalid second byte %x" % (b2))
    # Shift forward since cells start at 1.
    cell = cell + 1
    return jisx0208_encode(row, cell)

# Make sure the decoder doesn't accept rubbish input.
def test_shiftjis_jisx0208_decoder():
    """Feed every byte pair in -0x10 .. 0x110 to the Shift JIS decoder.

    A pair is expected to decode only when both values are bytes, the first is
    in 0x81 - 0x9F or 0xE0 - 0xEF, and the second is in 0x40 - 0xFC but not
    0x7F. Returns True when the decoder agrees for every pair, otherwise prints
    the first disagreement and returns False.
    """
    for b1 in range(-0x10, 0x110 + 1):
        for b2 in range(-0x10, 0x110 + 1):
            bs_valid = (byte_valid(b1) and byte_valid(b2))
            # Only 47 first bytes are used: 31 behind the JIS X 0201 symbols
            # and 16 ahead of them.
            b1_valid = ((0x81 <= b1 <= 0x9F) or (0xE0 <= b1 <= 0xEF))
            b2_valid = ((0x40 <= b2 <= 0x9E) or (0x9F <= b2 <= 0xFC))
            b2_not_del = (b2 != 0x7F)
            valid = (bs_valid and b1_valid and b2_valid and b2_not_del)
            try:
                enc = db_pack(b1, b2)
                decoded = shiftjis_decode_jisx0208(enc)
            except ValueError:
                if valid:
                    print("%x-%x %s" % (b1, b2, False))
                    return False
            else:
                if not valid:
                    print("%x-%x %x %x %s" % (b1, b2, enc, decoded, True))
                    return False
    return True

print("test_shiftjis_jisx0208_decoder: %s" % (test_shiftjis_jisx0208_decoder()))

# Make sure the encoder and decoder don't mangle the data.

def test_shiftjis_jisx0208_encoder():
    """Round-trip every JIS X 0208 row/cell through the Shift JIS codec.

    Returns True when every character in the 94x94 grid comes back unchanged;
    otherwise prints the first failing character and returns False.
    """
    for row in range(1, 94 + 1):
        for cell in range(1, 94 + 1):
            try:
                character = jisx0208_encode(row, cell)
                enc = shiftjis_encode_jisx0208(character)
                character2 = shiftjis_decode_jisx0208(enc)
                (row2, cell2) = jisx0208_decode(character2)
            except ValueError:
                # Everything in the grid is valid, so any rejection is a bug.
                print("%i,%i %s" % (row, cell, False))
                return False
            if row != row2 or cell != cell2 or character != character2:
                print("%i,%i %x %x %x %i,%i" % (row, cell, character, enc,
                    character2, row2, cell2))
                return False
    return True

print("test_shiftjis_jisx0208_encoder: %s" % (test_shiftjis_jisx0208_encoder()))

# Okay, so that's the hard part done. We now have the ability to encode and
# decode text in the two ways Shift JIS does: JIS X 0201 and mangled JIS X 0208.
# Now all we need is to write the Shift JIS bytes encoder! The idea is that the
# accepted input will be encoded characters from either JIS X 0201 or JIS X 0208
# and output will be bytes, and vice-versa for decoding. These are just simple
# wrappers and don't need much error checking since the other functions do that.
def shiftjis_encode(characters):
    """Encode an iterable of JIS X 0201 / JIS X 0208 characters as Shift JIS.

    JIS X 0201 characters are emitted as a single byte; anything else is
    treated as a packed JIS X 0208 character and emitted as two bytes.

    Returns a bytearray. Raises ValueError for a character that belongs to
    neither character set.
    """
    encoded = bytearray()
    for character in characters:
        if jisx0201_valid(character):
            encoded.append(character)
            continue
        (row, cell) = jisx0208_decode(character)
        if not jisx0208_valid(row, cell):
            raise ValueError("shiftjis_encode: invalid character %x" % (character))
        encoded.extend(db_unpack(shiftjis_encode_jisx0208(character)))
    return encoded


def shiftjis_decode(bytes):
    """Decode an iterable of Shift JIS bytes into a list of characters.

    Bytes that are valid JIS X 0201 come back as themselves; any other byte is
    treated as the first byte of a double byte and is decoded together with
    the byte that follows it into a packed JIS X 0208 character.

    The parameter keeps its original name `bytes` (shadowing the builtin inside
    this function only) so keyword callers keep working.

    Raises ValueError on an invalid double byte or a trailing first byte.
    """
    characters = []
    head = None
    for byte in bytes:
        if head is None:
            if jisx0201_valid(byte):
                characters.append(byte)
            else:
                head = byte
        else:
            characters.append(shiftjis_decode_jisx0208(db_pack(head, byte)))
            head = None
    if head is not None:
        # Trailing first byte?
        raise ValueError("shiftjis_decode: trailing first byte %x" % (head))
    return characters

# Testing the mix of JIS X 0201 and JIS X 0208 characters is a bit hard without
# fuzz testing, and the tests I've been doing so far have just been exhaustive.
# I don't want to write a random testing framework right now, so let's just use
# traditional unit tests with some examples that should shake out bad code.

def test_shiftjis_codec():
    """Run a handful of hand-written round-trip and rejection examples.

    Returns True when every valid example survives an encode/decode round trip
    unchanged and every invalid example is rejected with ValueError; otherwise
    prints the failing example and returns False.
    """
    valid_tests = [
        # Just ASCII.
        [ord('H'), ord('e'), ord('l'), ord('l'), ord('o'), 0x00],
        # ASCII with a JIS X 0208 character.
        [ord('H'), ord('e'), ord('y'), jisx0208_encode(0x1, 0xA), 0x00],
        # JIS X 0201 characters with a JIS X 0208 character.
        [0xA2, 0xA3, jisx0208_encode(0x1, 0xA), 0x00],
    ]
    for test in valid_tests:
        try:
            enc = shiftjis_encode(test)
            dec = shiftjis_decode(enc)
        except ValueError:
            print("%s" % (test))
            return False
        # Surviving without an exception isn't enough, the data has to match.
        if dec != test:
            print("%s" % (test))
            return False

    # Have to have invalid tests made up of bytes to avoid all the safety checks
    # in the functions we've created.
    invalid_tests = [
        # Invalid second byte.
        bytearray(b'hello!\x81\x10\x00'),
        # Unused first bytes.
        bytearray(b'hello!\x80\x10\x21'),
        bytearray(b'hello!\x80\xFE\x21'),
        bytearray(b'hello!\x80\x81\x30'),
        bytearray(b'hello!\x80\x81\xFF'),
        # Unused second bytes (behind a first byte that is actually used).
        bytearray(b'hello!\x81\x30'),
        bytearray(b'hello!\x81\x7F'),
        bytearray(b'hello!\x81\xFF'),
        # Trailing first byte.
        bytearray(b'hello!\x91'),
    ]
    for test in invalid_tests:
        try:
            shiftjis_decode(test)
        except ValueError:
            continue
        print("%s" % (test))
        return False
    return True

print("test_shiftjis_codec: %s" % (test_shiftjis_codec()))

# Whew! Everything should be fine, and the encoder should work. Of course, it's
# still possible that it's outputting garbage that it can somehow encode and
# decode losslessly. (I really doubt that, though.) But just in case there's
# more development to do, we're going to do a final test by using Python's built
# in encoder/decoder to do a round trip for comparison.
#
# A small snag is that Python will fail characters that can't be converted to
# Unicode, even if they can be encoded in Shift JIS (this includes private use
# characters like emoji). The solution to this is for our tests to check if the
# character in a row or cell is allocated in the standard, since the standard
# has since been incorporated to Unicode.

# Inclusive (first, last) cell ranges that are unallocated, keyed by row.
# These are all transcribed based on the JIS 0208 chart listed earlier.
_JISX0208_UNALLOCATED = {
    2: [(15, 25), (34, 41), (49, 59), (75, 81), (90, 93)],
    3: [(1, 15), (26, 32), (59, 64), (91, 94)],
    4: [(84, 94)],
    5: [(87, 94)],
    6: [(25, 32), (57, 94)],
    7: [(34, 48), (82, 94)],
    8: [(33, 94)],
    47: [(52, 94)],
    84: [(7, 94)],
}


def jisx0208_allocated(row, cell):
    """Return True if the row/cell holds a character in the transcribed chart.

    Positions outside the 94x94 grid, the unassigned rows 9 - 15 and 85 - 94,
    and the gaps listed in _JISX0208_UNALLOCATED all return False.
    """
    if not jisx0208_valid(row, cell):
        return False
    if (8 < row < 16) or (84 < row):
        # Unassigned rows.
        return False
    gaps = _JISX0208_UNALLOCATED.get(row, ())
    return not any(first <= cell <= last for (first, last) in gaps)


def test_shiftjis_external():
    """Compare our encoder's output against Python's own shift_jis codec.

    For every row, the allocated characters plus a few JIS X 0201 characters
    are encoded with shiftjis_encode, then decoded and re-encoded by Python.
    Returns True when the bytes match for every row; otherwise prints the
    failing input (and both encodings, if there are any) and returns False.
    """
    # Loop through all the rows and make sure that we can do a round trip of
    # encoding and decoding of valid characters.
    for row in range(1, 94 + 1):
        points = [jisx0208_encode(row, cell)
                  for cell in range(1, 94 + 1)
                  if jisx0208_allocated(row, cell)]
        # Mix in some JIS X 0201 characters!
        points.extend([ord('H'), ord('e'), ord('y'), 0xA2, 0xA3])
        try:
            enc = shiftjis_encode(points)
            enc2 = enc.decode("shift_jis").encode("shift_jis")
        except ValueError:
            # Either we rejected the input or Python rejected our output
            # (Unicode errors are a kind of ValueError). Both are failures.
            print("%s" % (points))
            return False
        if enc != enc2:
            print("%s %s %s" % (points, enc, enc2))
            return False
    return True

print("test_shiftjis_external: %s" % (test_shiftjis_external()))

# Well, that's all. I hope you learned something today. If you want to mess with
# the code some more, I suggest adding support for Microsoft's Code Page 943
# variant and other non-standard variants. I think some of them just add more
# rows so it only works in Shift JIS and not ISO/IEC 2022, or use unallocated
# rows. There's also JIS X 0213 which expands the character set in to two
# planes, but I haven't found much documentation on how this works since I can't
# read Japanese. I hope if I've gotten any point across to you, it's that Shift
# JIS isn't that scary once you stop using formulas and decimal numbers. The
# only bizarre moments come from the choice of offsets. See you later!

# The author of this file has dedicated its contents to the public domain
# using the CC0 Public Domain Dedication 1.0. For full legal information see
# <https://creativecommons.org/publicdomain/zero/1.0/>.