redtailtech · July 30, 2016 21:57 · Jul 3, 2009 · Jul 3, 2009
diff --git a/gistfile1.rb b/gistfile1.rb
@@ -1,7 +1,8 @@
 #
 # This function takes messy Word HTML pasted into a WYSIWYG and cleans it up
 # It leaves the tags and attributes specified in the params
-#
+# Copyright (c) 2009, Radio New Zealand
+# Released under the MIT license
 
 require 'rubygems'
 require 'sanitize'

diff --git a/gistfile1.rb b/gistfile1.rb
@@ -0,0 +1,45 @@
+#
+# This function takes messy Word HTML pasted into a WYSIWYG and cleans it up
+# It leaves the tags and attributes specified in the params
+#
+
+require 'rubygems'
+require 'sanitize'
+
+# Cleans Word-generated HTML down to a whitelist of tags and attributes.
+#
+# html       - String of HTML; it is not modified, a new String is returned.
+# elements   - Array of tag names passed to Sanitize as :elements.
+# attributes - Hash passed to Sanitize as :attributes.
+# Returns the cleaned String, with "\n" after </p>, </hN>, </dt>, </dd>, </dl> and <dl>.
+def clean_up_word_html(html, elements = ['p', 'b', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'], attributes={})
+  email_regex = /<p>Email:\s+((\w|\-|\_|\.)+\@((\w|\-|\_)+\.)+[a-zA-Z]{2,})/i
+
+  # Non-bang gsub so the caller's string is left alone (and frozen input works).
+  # Inside [...] a "|" is a literal, so the old /[\n|\r]/ also deleted pipes.
+  html = html.gsub(/[\n\r]/, '')
+
+  # keep only the things we want.
+  html = Sanitize.clean(html, :elements => elements, :attributes => attributes)
+
+  # butt up any tags
+  html = html.gsub(/&nbsp;/, ' ').gsub(/>\s+</, '><')
+
+  # remove email address lines
+  html = html.gsub(email_regex, '<p>')
+
+  # post sanitize cleanup of empty blocks
+  # the order of removal is important - this is the way word stacks these elements
+  [/<i><\/i>/, /<b><\/b>/, /<\/b><b>/, /<p><\/p>/, /<p><b><\/b><\/p>/].each do |empty|
+    html = html.gsub(empty, '')
+  end
+
+  # misc - fix butted times ("9am " -> "9 am ")
+  html = html.gsub(/(\d)(am|pm) /, '\1 \2 ')
+  # misc - remove multiple space that may cause doc specific regexs to fail (in dates for example)
+  html = html.gsub(/\s+/, ' ')
+
+  # add new lines at the end of lines
+  html.gsub(/<\/(p|h\d|dt|dd|dl)>/, "</\\1>\n").gsub(/<dl>/, "<dl>\n")
+end
+
No results found