|
|
@@ -0,0 +1,45 @@ |
|
|
#
# This function takes messy Word HTML pasted into a WYSIWYG editor and cleans it up.
# It keeps only the tags and attributes specified in the params.
#
|
|
|
|
|
require 'rubygems' |
|
|
require 'sanitize' |
|
|
|
|
|
# Cleans up HTML pasted from Word: whitelists tags/attributes through
# Sanitize.clean, then applies a fixed, ordered series of regex tidy-ups.
#
# @param html [String, nil] raw markup; nil is treated as an empty string.
#   The argument is never modified (frozen strings are accepted).
# @param elements [Array<String>] tag names handed to Sanitize as :elements
# @param attributes [Hash] handed to Sanitize as :attributes
# @return [String] a new string with one block-level element per line
def clean_up_word_html(html, elements = ['p', 'b', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'], attributes = {})
  # "<p>Email: someone@example.com" - character classes instead of nested
  # alternation so the match cannot backtrack ambiguously on underscores.
  email_line = /<p>Email:\s+[\w.\-]+@(?:[\w\-]+\.)+[a-zA-Z]{2,}/i

  # Drop line breaks on a copy. Only CR/LF are removed; a literal "|" in the
  # content must survive.
  flattened = html.to_s.gsub(/[\r\n]/, '')

  # Keep only the things we want.
  sanitized = Sanitize.clean(flattened, :elements => elements, :attributes => attributes)

  # Ordered [pattern, replacement] pairs; each step sees the output of the
  # previous one, so the order below is significant.
  steps = [
    # Butt up any tags. Non-breaking spaces (entity or U+00A0) are not matched
    # by \s, so turn them into plain spaces first; otherwise "<p>&nbsp;</p>"
    # would never collapse to "<p></p>".
    # NOTE: the \u escape assumes UTF-8 compatible input.
    [/&nbsp;|\u00A0/, ' '],
    [/>\s+</, '><'],

    # Remove email address lines.
    [email_line, '<p>'],

    # Post-sanitize removal of empty blocks. The order is important - this is
    # the way Word stacks these elements. gsub does not rescan, so the final
    # step still catches paragraphs left holding one empty <b> pair.
    [%r{<i></i>}, ''],
    [%r{<b></b>}, ''],
    [%r{</b><b>}, ''],
    [%r{<p></p>}, ''],
    [%r{<p><b></b></p>}, ''],

    # Fix butted times: "9am " -> "9 am ", "3pm " -> "3 pm ".
    [/(\d)(am|pm) /, '\1 \2 '],

    # Collapse runs of whitespace that may cause document-specific regexes
    # (dates, for example) to fail.
    [/\s+/, ' '],

    # Put a newline after block-level closing tags and after an opening <dl>.
    [%r{</(p|h\d|dt|dd|dl)>}, "</\\1>\n"],
    [/<dl>/, "<dl>\n"]
  ]

  steps.reduce(sanitized) { |text, (pattern, replacement)| text.gsub(pattern, replacement) }
end
|
|
|