use strict;
use warnings;
use HTML::Strip;
use Devel::Peek;
use Test::More tests => 3;
use Encode;
use utf8;

=head1 Workaround for HTML::Strip with utf8

As discussed with Zefram and ilmari on #london.pm, thanks!

L<HTML::Strip> doesn't handle utf8 properly, as it's XS and probably not
written to work on characters, only bytes.

By default the parse method, when given unicode, returns a bytestring with
no unicode markings.  A naive way to handle this would be to simply
decode_utf8.  This works for utf8 strings... but not for extended latin1.

A better workaround, suggested by Zefram, is to encode and downgrade first,
then decode after.

NB: this is just a workaround.  Better solutions would be to a) fix
HTML::Strip or b) use HTML::Parser instead

=cut

# One fixture per class of input the stripper has to survive: plain ASCII,
# a character above U+00FF, and a character in the U+0080..U+00FF range.
my @strings = (
    {
        type   => 'ascii',
        string => 'test',
    },
    {
        type   => 'unicode',
        string => "\x{2603}",    # snowman
    },
    {
        type   => 'latin1',
        string => "L\x{e9}on",
    }
);

# A single stripper object is shared by every parse_* helper below; each
# helper calls eof() after parsing so the object can be reused.
my $hs = HTML::Strip->new();

for my $record (@strings) {
    my $string = $record->{string};

    # NOTE(review): the literal below contains only a newline even though the
    # trailing comment describes it as sample HTML.  The markup may have been
    # lost when this file was copied around -- confirm against the original.
    my $html = $string . "
"; # some sample html to strip

    # my $stripped = parse_simple( $html );   # fails the unicode test
    # my $stripped = parse_unicodey( $html ); # fails the latin1 test
    my $stripped = parse_workaround( $html );

    # Test::More::is() takes ($got, $expected, $name): the stripped text is
    # what we got, the fixture string is what we expect.
    is( $stripped, $string, $record->{type} ); # or do { Dump($string); Dump($stripped) };
}

# parse_simple( $html )
#
# Passes $html straight to the shared stripper and returns whatever
# HTML::Strip hands back, with no encoding or decoding on either side.
# Kept for comparison: the loop above notes that it fails the unicode test.
sub parse_simple {
    my $html = shift;

    my $stripped = $hs->parse($html);
    $hs->eof;

    return $stripped;
}

# parse_unicodey( $html )
#
# The naive fix: strip $html as-is, then treat the result as UTF-8 octets
# and decode it into a character string.
# Kept for comparison: the loop above notes that it fails the latin1 test.
sub parse_unicodey {
    my $html = shift;

    my $stripped = $hs->parse($html);
    $hs->eof;

    return decode_utf8($stripped);
}

# parse_workaround( $html )
#
# The recommended workaround: hand HTML::Strip nothing but bytes, and turn
# its output back into characters afterwards.
#
#   $html   - character string containing the markup to strip
#   returns - the stripped text, decoded from UTF-8 into a character string
sub parse_workaround {
    my $html = shift;

    # Encode to UTF-8 octets first, so every input is in the same byte
    # encoding regardless of how perl stored the original string.
    my $octets = encode_utf8($html);

    # Make sure the octets are held in perl's downgraded (byte) internal
    # form before they reach the XS code.
    utf8::downgrade($octets);

    my $stripped = $hs->parse($octets);
    $hs->eof;

    return decode_utf8($stripped);
}