|
|
@@ -0,0 +1,199 @@ |
|
|
#import "NSData+OADataHelpers.h" |
|
|
|
|
|
@implementation NSData (OADataHelpers)

// Outcome of examining the bytes at one position of a UTF-8 byte stream.
typedef enum {
    OAUTF8SequenceWellFormed,   // a complete well-formed sequence starts at this position
    OAUTF8SequenceIllFormed,    // the bytes at this position cannot be part of well-formed UTF-8
    OAUTF8SequenceTruncated     // a well-formed prefix that is cut off by the end of the data
} OAUTF8SequenceStatus;

// Classifies the UTF-8 sequence that starts at bytes[0].
//
// Well-formed sequences (RFC 3629 / Unicode Table 3-7):
//
//   code points            1st byte   2nd byte   3rd byte   4th byte
//   U+0000   .. U+007F     00..7F
//   U+0080   .. U+07FF     C2..DF     80..BF
//   U+0800   .. U+0FFF     E0         A0..BF     80..BF
//   U+1000   .. U+CFFF     E1..EC     80..BF     80..BF
//   U+D000   .. U+D7FF     ED         80..9F     80..BF
//   U+E000   .. U+FFFF     EE..EF     80..BF     80..BF
//   U+10000  .. U+3FFFF    F0         90..BF     80..BF     80..BF
//   U+40000  .. U+FFFFF    F1..F3     80..BF     80..BF     80..BF
//   U+100000 .. U+10FFFF   F4         80..8F     80..BF     80..BF
//
// Everything else is ill-formed: stray continuation bytes (80..BF), overlong
// encodings (C0, C1, E0 80..9F, F0 80..8F), UTF-16 surrogates (ED A0..BF),
// values above U+10FFFF (F4 90.., F5..F7) and the obsolete 5- and 6-byte
// forms (F8..FD), as well as FE and FF.
//
// bytes      - pointer to the first byte to examine.
// available  - number of readable bytes starting at `bytes`; must be >= 1.
// consumed   - on return, the number of bytes the caller should skip:
//              * well-formed: the full length of the sequence (1..4);
//              * ill-formed:  the length of the longest prefix that could have started
//                             a well-formed sequence (at least 1). The offending byte
//                             itself is NOT included, so it is examined again as a
//                             potential start of the next sequence;
//              * truncated:   all `available` bytes.
static OAUTF8SequenceStatus OAUTF8ScanSequence(const unsigned char *bytes, NSUInteger available, NSUInteger *consumed)
{
    const unsigned char lead = bytes[0];
    NSUInteger sequenceLength = 0;
    NSUInteger offset = 0;

    // Only the second byte has a lead-dependent range; it is what excludes
    // overlong forms, surrogates and values above U+10FFFF.
    unsigned char secondMin = 0x80;
    unsigned char secondMax = 0xBF;

    if (lead < 0x80)
    {
        *consumed = 1;
        return OAUTF8SequenceWellFormed;
    }
    else if (lead >= 0xC2 && lead <= 0xDF)
    {
        sequenceLength = 2;
    }
    else if (lead >= 0xE0 && lead <= 0xEF)
    {
        sequenceLength = 3;
        if (lead == 0xE0) secondMin = 0xA0;       // below A0 would be an overlong encoding
        else if (lead == 0xED) secondMax = 0x9F;  // above 9F would encode a surrogate
    }
    else if (lead >= 0xF0 && lead <= 0xF4)
    {
        sequenceLength = 4;
        if (lead == 0xF0) secondMin = 0x90;       // below 90 would be an overlong encoding
        else if (lead == 0xF4) secondMax = 0x8F;  // above 8F would exceed U+10FFFF
    }
    else
    {
        // 80..BF (stray continuation), C0, C1, F5..FF: never valid as a first byte.
        *consumed = 1;
        return OAUTF8SequenceIllFormed;
    }

    for (offset = 1; offset < sequenceLength; offset++)
    {
        unsigned char lowest;
        unsigned char highest;

        if (offset >= available)
        {
            // Everything seen so far is valid, but the data ends mid-sequence.
            *consumed = available;
            return OAUTF8SequenceTruncated;
        }

        lowest  = (offset == 1) ? secondMin : 0x80;
        highest = (offset == 1) ? secondMax : 0xBF;
        if (bytes[offset] < lowest || bytes[offset] > highest)
        {
            // Skip only the valid prefix; bytes[offset] may itself start a valid sequence.
            *consumed = offset;
            return OAUTF8SequenceIllFormed;
        }
    }

    *consumed = sequenceLength;
    return OAUTF8SequenceWellFormed;
}

// Decodes the receiver as UTF-8 text, substituting U+FFFD for every ill-formed
// byte sequence (see -dataByHealingUTF8Stream).
//
// Returns an autoreleased string. Because the bytes are healed before decoding,
// data containing ill-formed sequences is decoded with U+FFFD substitutions
// rather than being handed to NSString in its broken form.
//
// NOTE(review): this selector has the same name as -[NSString UTF8String], which
// returns `const char *`. Sending -UTF8String to an `id`-typed object is therefore
// ambiguous for the compiler; keep receivers statically typed.
- (NSString*) UTF8String
{
    NSData* healedData = [self dataByHealingUTF8Stream];
    return [[[NSString alloc] initWithData:healedData encoding:NSUTF8StringEncoding] autorelease];
}

// Returns a copy of the receiver in which every ill-formed UTF-8 byte sequence is
// replaced by U+FFFD REPLACEMENT CHARACTER (bytes EF BF BD), so that the result is
// always well-formed UTF-8 as defined by RFC 3629.
//
// - Well-formed sequences are copied through unchanged.
// - Each ill-formed sequence yields exactly one U+FFFD and consumes only the bytes
//   that cannot be valid; a byte that merely fails to continue the previous
//   sequence is re-examined, so valid text following a broken lead byte is kept.
// - An incomplete (but so far valid) sequence at the very end of the data is
//   dropped without a replacement character. This keeps the long-standing
//   behavior for data that was cut off in the middle of a character.
//
// Returns an autoreleased data object; the receiver is not modified.
- (NSData*) dataByHealingUTF8Stream
{
    // U+FFFD encoded as UTF-8. Spelled out as bytes so the result does not depend
    // on the encoding this source file is compiled with.
    static const unsigned char replacementBytes[] = { 0xEF, 0xBF, 0xBD };

    const unsigned char* bytes = (const unsigned char*)[self bytes];
    const NSUInteger length = [self length];
    NSMutableData* healedData = [NSMutableData dataWithCapacity:length];

    // [runStart, position) is the current run of well-formed bytes that has not
    // been copied to healedData yet. Copying whole runs avoids per-byte appends.
    NSUInteger runStart = 0;
    NSUInteger position = 0;

    while (position < length)
    {
        NSUInteger consumed = 0;
        OAUTF8SequenceStatus status = OAUTF8ScanSequence(bytes + position, length - position, &consumed);

        if (status == OAUTF8SequenceWellFormed)
        {
            position += consumed;
            continue;
        }

        // Ill-formed or truncated: first copy the valid run that precedes it.
        if (position > runStart)
        {
            [healedData appendBytes:(bytes + runStart) length:(position - runStart)];
        }

        if (status == OAUTF8SequenceIllFormed)
        {
            [healedData appendBytes:replacementBytes length:sizeof(replacementBytes)];
        }
        // OAUTF8SequenceTruncated: the partial trailing character is dropped silently.

        position += consumed;
        runStart = position;
    }

    // Copy the final run of well-formed bytes, if any.
    if (position > runStart)
    {
        [healedData appendBytes:(bytes + runStart) length:(position - runStart)];
    }

    return healedData;
}

@end