#
# This function takes messy Word HTML pasted into a WYSIWYG and cleans it up
# It leaves the tags and attributes specified in the params
# Copyright (c) 2009, Radio New Zealand
# Released under the MIT license

require 'rubygems'
require 'sanitize'

# Cleans up HTML that was pasted from Word into a WYSIWYG editor.
#
# The markup is flattened to one line, passed through Sanitize so that only
# the requested elements/attributes remain, stripped of empty inline/paragraph
# shells, whitespace-normalised, and finally re-broken so that each block
# element ends its own line.
#
# The string passed in is not modified; a new string is returned.
#
# NOTE(review): the literal tags inside the patterns and replacement strings
# below (<p>, <i>, <b>, <dl>, &nbsp;, '><', '</\1>') were reconstructed from a
# copy of this file in which the markup had been stripped. Confirm them against
# the upstream original before relying on the exact output.
#
# @param html [String] the raw HTML fragment to clean
# @param elements [Array<String>] element names passed to Sanitize as :elements
# @param attributes [Hash] attribute whitelist passed to Sanitize as :attributes
# @return [String] the cleaned HTML, one block element per line
def clean_up_word_html(html, elements = ['p', 'b', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'], attributes = {})
  # Character classes instead of (a|b|c)+ alternation: same set of matches,
  # but no overlapping alternatives for the regex engine to backtrack through.
  email_regex = /<p>Email:\s+[\w.\-]+@(?:[\w\-]+\.)+[a-zA-Z]{2,}/i

  # Flatten to a single line. Only CR and LF are removed; the pattern used to
  # be /[\n|\r]/, which also deleted every literal "|" in the document.
  # Non-destructive gsub keeps the caller's string untouched.
  html = html.gsub(/[\n\r]/, '')

  # keep only the things we want.
  html = Sanitize.clean(html, :elements => elements, :attributes => attributes)

  # butt up any tags
  html = html.gsub(/&nbsp;/, ' ')
  html = html.gsub(/>\s+</, '><')

  # remove email address lines (the opening <p> is kept)
  html = html.gsub(email_regex, '<p>')

  # post sanitize cleanup of empty blocks
  # the order of removal is important - this is the way word stacks these elements
  [
    %r{<i></i>},
    %r{<b></b>},
    %r{</b><b>},
    %r{<p></p>},
    %r{<p><b></b></p>}
  ].each do |empty_shell|
    html = html.gsub(empty_shell, '')
  end

  # misc - fix butted times ("10am " -> "10 am ")
  html = html.gsub(/(\d)am /, '\1 am ')
  html = html.gsub(/(\d)pm /, '\1 pm ')

  # misc - remove multiple space that may cause doc specific regexs to fail
  # (in dates for example)
  html = html.gsub(/\s+/, ' ')

  # add new lines at the end of lines
  html = html.gsub(%r{</(p|h\d|dt|dd|dl)>}, "</\\1>\n")
  html.gsub(/<dl>/, "<dl>\n")
end