#
# This function takes messy Word HTML pasted into a WYSIWYG and cleans it up
# It leaves the tags and attributes specified in the params
# Copyright (c) 2009, Radio New Zealand
# Released under the MIT license

require 'rubygems'
require 'sanitize'

# Cleans up HTML pasted from Word.
#
# Line breaks are removed, the markup is passed through Sanitize.clean so that
# only the given elements and attributes remain, and then a fixed, ordered
# list of regex substitutions tidies what is left (see the rule table below).
#
# NOTE(review): the tag literals inside the patterns (<p>, <i>, <b>, <dl>,
# &nbsp;) were restored from a copy of this file whose markup had been
# stripped. The "</b><b>" and "<dl>" rules are the least certain - confirm
# them against the upstream source.
#
# html       - String of HTML to clean. It is not modified.
# elements   - Array of tag names handed to Sanitize as :elements.
# attributes - Hash handed to Sanitize as :attributes.
#
# Returns a new String containing the cleaned HTML.
def clean_up_word_html(html, elements = ['p', 'b', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'], attributes = {})
  # Matches the start of an "Email: someone@example.com" paragraph.
  # Character classes are used instead of alternations such as (\w|\-|\_|\.)
  # because the alternatives overlap ("_" is already part of \w), which
  # invites heavy backtracking on long non-matching input.
  email_line = /<p>Email:\s+[\w\-.]+@(?:[\w\-]+\.)+[a-zA-Z]{2,}/i

  # Drop line breaks. Inside a character class "|" is a literal pipe, not an
  # alternation, so it must not be listed or pipes would be deleted as well.
  # Non-bang gsub leaves the caller's string untouched (and works when frozen).
  flattened = html.gsub(/[\r\n]/, '')

  # keep only the things we want.
  sanitized = Sanitize.clean(flattened, :elements => elements, :attributes => attributes)

  # Ordered [pattern, replacement] pairs; each is applied to the result of the
  # previous one.
  rules = [
    # butt up any tags
    [/&nbsp;/, ' '],
    [/>\s+</, '><'],

    # remove email address lines (the opening <p> is kept)
    [email_line, '<p>'],

    # post sanitize cleanup of empty blocks
    # the order of removal is important - this is the way word stacks these
    # elements
    [%r{<i></i>}, ''],
    [%r{<b></b>}, ''],
    [%r{</b><b>}, ''],
    [%r{<p></p>}, ''],
    [%r{<p><b></b></p>}, ''],

    # misc - fix butted times ("9am " -> "9 am ", "5pm " -> "5 pm ")
    [/(\d)([ap]m) /, '\1 \2 '],

    # misc - remove multiple space that may cause doc specific regexs to fail
    # (in dates for example)
    [/\s+/, ' '],

    # add new lines at the end of lines
    [%r{</(p|h\d|dt|dd|dl)>}, "</\\1>\n"],
    [/<dl>/, "<dl>\n"]
  ]

  rules.reduce(sanitized) { |text, (pattern, replacement)| text.gsub(pattern, replacement) }
end