From d9b6641a99eaa514d704f5b08f21dfd882ac727e Mon Sep 17 00:00:00 2001 From: relikd Date: Wed, 6 Mar 2019 02:05:09 +0100 Subject: [PATCH] Fix crash when libxml set error in @autoreleasepool - libxml will return first parsing error instead of last one - option to replace lower ascii chars with whitespace --- RSXML/RSSAXParser.h | 1 + RSXML/RSSAXParser.m | 25 +++++++++++++++++++- RSXML/RSXMLError.m | 1 - RSXML/RSXMLParser.h | 1 + RSXML/RSXMLParser.m | 34 ++++++++++++++++------------ RSXMLTests/RSXMLTests.m | 22 ++++++++++++++++++ RSXMLTests/Resources/broken.rss | 26 +++++++++++++++++++++ RSXMLTests/Resources/lower-ascii.rss | 30 ++++++++++++++++++++++++ 8 files changed, 124 insertions(+), 16 deletions(-) create mode 100644 RSXMLTests/Resources/broken.rss create mode 100644 RSXMLTests/Resources/lower-ascii.rss diff --git a/RSXML/RSSAXParser.h b/RSXML/RSSAXParser.h index 75c26a3..28b52a4 100644 --- a/RSXML/RSSAXParser.h +++ b/RSXML/RSSAXParser.h @@ -61,6 +61,7 @@ @interface RSSAXParser : NSObject +@property (nonatomic, strong, readonly) NSError *parsingError; @property (nonatomic, strong, readonly) NSData *currentCharacters; @property (nonatomic, strong, readonly) NSString *currentString; @property (nonatomic, strong, readonly) NSString *currentStringWithTrimmedWhitespace; diff --git a/RSXML/RSSAXParser.m b/RSXML/RSSAXParser.m index 126bace..a647c2a 100644 --- a/RSXML/RSSAXParser.m +++ b/RSXML/RSSAXParser.m @@ -27,6 +27,8 @@ #import #import "RSSAXParser.h" +const NSErrorDomain kLIBXMLParserErrorDomain = @"LIBXMLParserErrorDomain"; + @interface RSSAXParser () @property (nonatomic, weak) id delegate; @@ -97,6 +99,8 @@ static xmlSAXHandler saxHandlerStruct; */ - (void)parseBytes:(const void *)bytes numberOfBytes:(NSUInteger)numberOfBytes { + _parsingError = nil; + if (self.context == nil) { if (self.isHTMLParser) { xmlCharEncoding characterEncoding = xmlDetectCharEncoding(bytes, (int)numberOfBytes); @@ -342,6 +346,11 @@ static xmlSAXHandler saxHandlerStruct; } } +- (void)xmlParsingErrorOccured:(NSError*)error { + if (!self.parsingError) // grep first encountered error + _parsingError = error; +} + @end @@ -369,6 +378,20 @@ static void endElementSAX_HTML(void *context, const xmlChar *localname) { [(__bridge RSSAXParser *)context xmlEndHTMLElement:localname]; } +static void errorOccuredSAX(void *context, const char *format, ...) { + xmlErrorPtr err = xmlGetLastError(); + if (err && err->level == XML_ERR_FATAL) { + int errCode = err->code; + char * msg = err->message; + NSString *errMsg = [[NSString stringWithFormat:@"%s", msg] stringByTrimmingCharactersInSet: + [NSCharacterSet whitespaceAndNewlineCharacterSet]]; + NSError *error = [NSError errorWithDomain:kLIBXMLParserErrorDomain code:errCode + userInfo:@{ NSLocalizedDescriptionKey: errMsg }]; + [(__bridge RSSAXParser *)context xmlParsingErrorOccured:error]; + } + xmlResetLastError(); +} + static xmlSAXHandler saxHandlerStruct = { nil, /* internalSubset */ @@ -393,7 +416,7 @@ static xmlSAXHandler saxHandlerStruct = { nil, /* processingInstruction */ nil, /* comment */ nil, /* warning */ - nil, /* error */ + errorOccuredSAX, /* error */ nil, /* fatalError //: unused error() get all the errors */ nil, /* getParameterEntity */ nil, /* cdataBlock */ diff --git a/RSXML/RSXMLError.m b/RSXML/RSXMLError.m index 6ec44c2..fd16d31 100644 --- a/RSXML/RSXMLError.m +++ b/RSXML/RSXMLError.m @@ -23,7 +23,6 @@ #import "RSXMLError.h" -const NSErrorDomain kLIBXMLParserErrorDomain = @"LIBXMLParserErrorDomain"; const NSErrorDomain kRSXMLParserErrorDomain = @"RSXMLParserErrorDomain"; const char * parserDescriptionForError(RSXMLError code); diff --git a/RSXML/RSXMLParser.h b/RSXML/RSXMLParser.h index 3c33fda..2f9a4fa 100644 --- a/RSXML/RSXMLParser.h +++ b/RSXML/RSXMLParser.h @@ -58,6 +58,7 @@ @interface RSXMLParser<__covariant T> : NSObject @property (nonatomic, readonly, nonnull, copy) NSString *documentURI; +@property (nonatomic, assign) BOOL dontStopOnLowerAsciiBytes; + (instancetype)parserWithXMLData:(RSXMLData * _Nonnull)xmlData; diff --git a/RSXML/RSXMLParser.m b/RSXML/RSXMLParser.m index de3791c..c4ce446 100644 --- a/RSXML/RSXMLParser.m +++ b/RSXML/RSXMLParser.m @@ -21,8 +21,6 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#import - #import "RSXMLParser.h" #import "RSXMLData.h" #import "RSXMLError.h" @@ -71,6 +69,22 @@ return self; } +/** + XML allows only specific lower ascii characters (<0x20), namely 0x9, 0xA, and 0xD. + See: https://www.w3.org/TR/xml/#charsets + */ +- (void)replaceLowerAsciiBytesWithSpace { + [_xmlData enumerateByteRangesUsingBlock:^(const void * bytes, NSRange byteRange, BOOL * stop) { + NSUInteger max = byteRange.location + byteRange.length; + for (NSUInteger i = byteRange.location; i < max; i++) { + unsigned char c = ((unsigned char*)bytes)[i]; + if (c < 0x20 && c != 0x9 && c != 0xA && c != 0xD) { + ((unsigned char*)bytes)[i] = ' '; // replace lower ascii with blank + } + } + }]; +} + /** Parse the XML data on whatever thread this method is called. @@ -82,24 +96,16 @@ if (error) *error = _xmlInputError; return nil; } - + if (_dontStopOnLowerAsciiBytes) { + [self replaceLowerAsciiBytesWithSpace]; + } if ([self respondsToSelector:@selector(xmlParserWillStartParsing)] && ![self xmlParserWillStartParsing]) return nil; @autoreleasepool { - xmlResetLastError(); [_parser parseBytes:_xmlData.bytes numberOfBytes:_xmlData.length]; - if (error) { - xmlErrorPtr err = xmlGetLastError(); - if (err && err->level == XML_ERR_FATAL) { - int errCode = err->code; - char * msg = err->message; - NSString *errMsg = [[NSString stringWithFormat:@"%s", msg] stringByTrimmingCharactersInSet:[NSCharacterSet whitespaceAndNewlineCharacterSet]]; - *error = [NSError errorWithDomain:kLIBXMLParserErrorDomain code:errCode userInfo:@{NSLocalizedDescriptionKey: errMsg}]; - } - xmlResetLastError(); - } } + if (error) *error = _parser.parsingError; return [self xmlParserWillReturnDocument]; } diff --git a/RSXMLTests/RSXMLTests.m b/RSXMLTests/RSXMLTests.m index 352fbc1..cf014f2 100644 --- a/RSXMLTests/RSXMLTests.m +++ b/RSXMLTests/RSXMLTests.m @@ -265,6 +265,28 @@ #pragma clang diagnostic pop } +- (void)testLowerAsciiCharacters { + NSError *error = nil; + RSXMLData *xmlData = [self xmlFile:@"lower-ascii" extension:@"rss"]; + RSXMLParser *parser = [xmlData getParser]; + RSParsedFeed *parsedFeed = [parser parseSync:&error]; + XCTAssertNotNil(error); + XCTAssertEqual(parsedFeed.articles.count, 2); + parser.dontStopOnLowerAsciiBytes = YES; + parsedFeed = [parser parseSync:&error]; + XCTAssertNil(error); + XCTAssertEqual(parsedFeed.articles.count, 5); +} + +- (void)testBrokenXML { + NSError *error = nil; + RSXMLData *xmlData = [self xmlFile:@"broken" extension:@"rss"]; + [[xmlData getParser] parseSync:&error]; + XCTAssertNotNil(error); + XCTAssertEqual(error.code, 76); + XCTAssertEqualObjects(error.localizedDescription, @"Opening and ending tag mismatch: channel line 0 and rss"); +} + - (void)testDownloadedFeeds { NSError *error = nil; int i = 0; diff --git a/RSXMLTests/Resources/broken.rss b/RSXMLTests/Resources/broken.rss new file mode 100644 index 0000000..d91a609 --- /dev/null +++ b/RSXMLTests/Resources/broken.rss @@ -0,0 +1,26 @@ + + + + Manton Reece + + http://www.manton.org + + Fri, 25 Sep 2015 14:26:40 +0000 + en-US + hourly + 1 + http://wordpress.org/?v=4.2.5 + + + http://www.manton.org/2015/09/3071.html + http://www.manton.org/2015/09/3071.html#comments + Fri, + + \ No newline at end of file diff --git a/RSXMLTests/Resources/lower-ascii.rss b/RSXMLTests/Resources/lower-ascii.rss new file mode 100644 index 0000000..d5dadfb --- /dev/null +++ b/RSXMLTests/Resources/lower-ascii.rss @@ -0,0 +1,30 @@ + + + Feed Title + + 1 + http://someurl.com/1/ + + + + 2 + http://someurl.com/2/ + + + + 3 + http://someurl.com/3/ + + + + 4 + http://someurl.com/4/ + + + + 5 + http://someurl.com/5/ + + + + \ No newline at end of file