jimjkelly · July 22, 2013 15:43
diff --git a/python-encoding.py b/python-encoding.py
 # All data coming across the intarwebs is encoded in some character encoding.
 # This could be ASCII, UTF-8, UTF-16, Shift-JIS, etc.  To properly
 # handle data, you need to know the encoding.  Thankfully on the web
 # the de facto standard seems to be moving towards UTF-8.
 # 
 # In order to safely deal with data - you want to decode this encoded
 # data (referred to in Python world as a byte string) from its 
 # encoding to the generic unicode data type - Python can
 # safely work with this in all situations. Let's pretend we
 # have some data foo we have just read in from the intarwebs

 bar = foo.decode('utf-8')  # byte string -> unicode (treats foo as UTF-8 encoded)

 # bar is now safe to work with - no UnicodeDecodeErrors! When working
 # with hard coded text strings, it's always good to write them like
 # this so they are unicode and not byte strings:

 hello = u'hello' # good! the u prefix makes this a unicode literal
 goodbye = 'goodbye' # bad! no prefix, so this is a byte string

 # The other thing you need to know is that when you send data out
 # of your program you need to now *encode* it from its unicode
 # representation to an encoding. Once again, utf-8 is always
 # a fine choice

 print bar.encode('utf-8')  # encode to UTF-8 bytes before printing
 with open('output.txt', 'w') as fp:
    fp.write(bar.encode('utf-8'))  # encode to UTF-8 bytes before writing to the file

 # And that's basically it - the key is to know that at the edges
 # of your program, i.e. as data is brought in or sent out, you should
 # be encoding/decoding, and only working with unicode internally.

 # It's a bit clunky, but once you get used to it and act in the
 # manner above, it's nice because it's all very deliberate.
	# All data coming across the intarwebs is encoded in some character encoding.
	# This could be ASCII, UTF-8, UTF-16, Shift-JIS, etc. To properly
	# handle data, you need to know the encoding. Thankfully on the web
	# the de facto standard seems to be moving towards UTF-8.
	#
	# In order to safely deal with data - you want to decode this encoded
	# data (referred to in Python world as a byte string) from its
	# encoding to the generic unicode data type - Python can
	# safely work with this in all situations. Let's pretend we
	# have some data foo we have just read in from the intarwebs

	bar = foo.decode('utf-8')  # byte string -> unicode (treats foo as UTF-8 encoded)

	# bar is now safe to work with - no UnicodeDecodeErrors! When working
	# with hard coded text strings, it's always good to write them like
	# this so they are unicode and not byte strings:

	hello = u'hello' # good! the u prefix makes this a unicode literal
	goodbye = 'goodbye' # bad! no prefix, so this is a byte string

	# The other thing you need to know is that when you send data out
	# of your program you need to now encode it from its unicode
	# representation to an encoding. Once again, utf-8 is always
	# a fine choice

	print bar.encode('utf-8')
	with open('output.txt', 'w') as fp:
	fp.write(bar.encode('utf-8'))

	# And that's basically it - the key is to know that at the edges
	# of your program, i.e. as data is brought in or sent out, you should
	# be encoding/decoding, and only working with unicode internally.

	# It's a bit clunky, but once you get used to it and act in the
	# manner above, it's nice because it's all very deliberate.