Refactoring to v.2.0

This commit is contained in:
relikd
2018-12-27 21:11:59 +01:00
parent f9e672661a
commit 62c5bef463
50 changed files with 2574 additions and 3128 deletions

View File

@@ -1,28 +1,212 @@
//
// RSXMLData.m
// RSXML
// MIT License (MIT)
//
// Created by Brent Simmons on 8/24/15.
// Copyright © 2015 Ranchero Software, LLC. All rights reserved.
// Copyright (c) 2016 Brent Simmons
// Copyright (c) 2018 Oleg Geier
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of
// this software and associated documentation files (the "Software"), to deal in
// the Software without restriction, including without limitation the rights to
// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
// of the Software, and to permit persons to whom the Software is furnished to do
// so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#import "RSXMLData.h"
#import "RSXMLError.h"
// Parser classes
#import "RSRSSParser.h"
#import "RSAtomParser.h"
#import "RSOPMLParser.h"
#import "RSHTMLMetadataParser.h"
@implementation RSXMLData
/// Minimum data length (in bytes) required before parser detection is attempted;
/// shorter data is rejected by -determineParserClass with RSXMLErrorNoData.
static const NSUInteger minNumberOfBytesToSearch = 20;
/// Upper bound on how far into the data the tag search looks: used as the byte limit
/// for strnstr() in -findCString: and as the character limit for the NSString search
/// in -determineParserClassSafeAndSlow.
static const NSInteger numberOfCharactersToSearch = 4096;
/**
 Create a data wrapper and immediately try to determine a matching parser class.

 The result of the detection is stored in @c _parserClass. If no parser class could be
 determined and the detection reported an error (stored in @c _parserError), the reference
 to @c data is dropped again, so the instance does not keep unusable data alive.

 @param data Raw bytes of the downloaded document.
 @param urlString Source URL of the document, stored as-is.
 @return The initialized instance, or @c nil if @c [super init] failed.
 */
- (instancetype)initWithData:(NSData *)data urlString:(NSString *)urlString {
    self = [super init];
    if (self) {
        _data = data;
        _urlString = urlString;
        _parserError = nil;
        _parserClass = [self determineParserClass]; // will set error
        if (!_parserClass && _parserError) {
            // Detection failed: release the payload. It must not be re-assigned afterwards.
            _data = nil;
        }
    }
    return self;
}
/**
 Get the location of the first occurrence of @c str in the leading bytes of the data.

 At most @c numberOfCharactersToSearch bytes are examined, and never more than
 @c _data.length: the NSData buffer is not guaranteed to be NUL-terminated, so the
 limit passed to @c strnstr() has to be clamped to the real buffer size.
 The search also ends at the first NUL byte, because that is how @c strnstr() works.
 The result is a byte offset and may be inaccurate as a character index since UTF8
 uses multi-byte characters.

 @param str NUL-terminated byte sequence to look for.
 @return Byte offset of the match from the start of the data, or @c NSNotFound.
 */
- (NSInteger)findCString:(const char*)str {
    const char *bytes = (const char *)_data.bytes;
    if (bytes == NULL || str == NULL) {
        // No data (or nothing to search for): nothing can be found.
        return NSNotFound;
    }
    // Never let strnstr() read beyond the end of the buffer.
    size_t searchLength = MIN(_data.length, (NSUInteger)numberOfCharactersToSearch);
    const char *foundStr = strnstr(bytes, str, searchLength);
    if (foundStr == NULL) {
        return NSNotFound;
    }
    return foundStr - bytes;
}
/**
 Check whether at least one tag can be located with @c -findCString: (which only looks at
 the beginning of the data).

 @param tags C array of NUL-terminated strings to look for.
 @param len Number of entries in @c tags.
 @return @c YES as soon as the first tag is found, @c NO if none of them is present.
 */
- (BOOL)matchAny:(const char*[])tags count:(int)len {
    int idx = 0;
    while (idx < len) {
        NSInteger position = [self findCString:tags[idx]];
        if (position != NSNotFound) {
            return YES;
        }
        idx += 1;
    }
    return NO;
}
/**
 Check whether every tag can be located with @c -findCString: (which only looks at the
 beginning of the data). The order in which the tags appear is irrelevant.

 @param tags C array of NUL-terminated strings to look for.
 @param len Number of entries in @c tags.
 @return @c NO as soon as one tag is missing, @c YES otherwise (also for an empty list).
 */
- (BOOL)matchAll:(const char*[])tags count:(int)len {
    BOOL everyTagFound = YES;
    // The loop condition stops the search right after the first miss.
    for (int idx = 0; everyTagFound && idx < len; idx++) {
        everyTagFound = ([self findCString:tags[idx]] != NSNotFound);
    }
    return everyTagFound;
}
/**
 Check that every tag is present and that the tags appear in the order given.

 Each tag is located independently with @c -findCString:, i.e. the comparison uses the
 offset of the first occurrence of each tag. Matching is exact (case sensitive).

 @param tags C array of NUL-terminated strings, in the expected order.
 @param len Number of entries in @c tags.
 @return @c NO if a tag is missing or is found before its predecessor, otherwise @c YES.
 */
- (BOOL)matchAllInCorrectOrder:(const char*[])tags count:(int)len {
    NSInteger previousOffset = 0;
    int idx = 0;
    while (idx < len) {
        NSInteger offset = [self findCString:tags[idx]];
        BOOL foundInOrder = (offset != NSNotFound && offset >= previousOffset);
        if (!foundInOrder) {
            return NO;
        }
        previousOffset = offset;
        idx++;
    }
    return YES;
}
#pragma mark - Determine XML Parser
/**
 Guess which parser fits the underlying data by looking for characteristic tags.

 The fast byte-wise checks run first, in this order: RSS, Atom, RDF (handled by the RSS
 parser), OPML, HTML. If none of them matches, the slower case-insensitive string search
 in @c -determineParserClassSafeAndSlow is used as a fallback.
 On failure @c _parserError is set and @c nil is returned.

 @return Parser class: @c RSRSSParser, @c RSAtomParser, @c RSOPMLParser or @c RSHTMLMetadataParser.
 */
- (nullable Class)determineParserClass {
    // TODO: check for things like images and movies and return nil.
    BOOL hasEnoughData = (_data != nil && _data.length >= minNumberOfBytesToSearch);
    if (!hasEnoughData) {
        // TODO: check size, type, etc.
        _parserError = RSXMLMakeError(RSXMLErrorNoData);
        return nil;
    }
    if ([self findCString:"<"] == NSNotFound) {
        _parserError = RSXMLMakeError(RSXMLErrorMissingLeftCaret);
        return nil;
    }

    const char *rssTags[] = {"<rss", "<channel"};
    const char *atomTags[] = {"<feed", "<entry"};
    const char *opmlTags[] = {"<opml", "<outline"};
    const char *htmlTags[] = {"<html", "<HTML", "<body", "<meta", "doctype html", "DOCTYPE html", "DOCTYPE HTML"};

    if ([self matchAll:rssTags count:(int)(sizeof(rssTags) / sizeof(rssTags[0]))]) {
        return [RSRSSParser class]; // RSS
    }
    if ([self matchAll:atomTags count:(int)(sizeof(atomTags) / sizeof(atomTags[0]))]) {
        return [RSAtomParser class]; // Atom
    }
    if ([self findCString:"<rdf:RDF"] != NSNotFound) {
        //TODO: parse RDF feeds ... for now, use RSS parser.
        return [RSRSSParser class];
    }
    if ([self matchAll:opmlTags count:(int)(sizeof(opmlTags) / sizeof(opmlTags[0]))]) {
        return [RSOPMLParser class];
    }
    if ([self matchAny:htmlTags count:(int)(sizeof(htmlTags) / sizeof(htmlTags[0]))]) {
        // Not every HTML document is recognised here, which is acceptable.
        return [RSHTMLMetadataParser class];
    }
    if ([self findCString:"<errors xmlns='http://schemas.google"] != NSNotFound) {
        _parserError = RSXMLMakeError(RSXMLErrorContainsXMLErrorsTag);
        return nil;
    }
    // Last resort: slower NSString conversion with a case insensitive search.
    return [self determineParserClassSafeAndSlow];
}
/**
 Fallback detection: decode the data as a string (UTF8 first, then
 @c NSUnicodeStringEncoding) and search it case insensitively.

 For every class in @c -listOfParserClasses the tags returned by
 @c +parserRequireOrderedTags must all be found within the first
 @c numberOfCharactersToSearch characters, in the order provided.
 Sets @c _parserError to @c RSXMLErrorNoSuitableParser if decoding fails or no class matches.

 @return The first matching parser class, or @c nil.
 */
- (nullable Class)determineParserClassSafeAndSlow {
    @autoreleasepool {
        void *rawBytes = (void *)_data.bytes;
        NSUInteger rawLength = _data.length;
        // The strings only borrow the bytes of _data (no copy, no free).
        NSString *text = [[NSString alloc] initWithBytesNoCopy:rawBytes length:rawLength encoding:NSUTF8StringEncoding freeWhenDone:NO];
        if (text == nil) {
            text = [[NSString alloc] initWithBytesNoCopy:rawBytes length:rawLength encoding:NSUnicodeStringEncoding freeWhenDone:NO];
        }
        if (text == nil) {
            _parserError = RSXMLMakeError(RSXMLErrorNoSuitableParser);
            return nil;
        }

        // Restrict the search to the head of the document.
        NSUInteger searchLength = numberOfCharactersToSearch;
        if (text.length < searchLength) {
            searchLength = text.length;
        }
        NSRange searchRange = NSMakeRange(0, searchLength);

        for (Class candidate in [self listOfParserClasses]) {
            NSArray<const NSString *> *orderedTags = [candidate parserRequireOrderedTags];
            BOOL tagsInOrder = YES;
            NSUInteger previousLocation = 0;
            for (NSString *tag in orderedTags) {
                NSRange hit = [text rangeOfString:tag options:NSCaseInsensitiveSearch range:searchRange];
                if (hit.location == NSNotFound || hit.location < previousLocation) {
                    tagsInOrder = NO;
                    break;
                }
                previousLocation = hit.location;
            }
            if (tagsInOrder) {
                return candidate;
            }
        }
    }
    // Try RSS anyway? libxml would return a parsing error
    _parserError = RSXMLMakeError(RSXMLErrorNoSuitableParser);
    return nil;
}
/// Parser classes that are probed by the string based detection, in probing order.
/// The array is created once and then shared.
/// @return @c RSRSSParser, @c RSAtomParser and @c RSOPMLParser.
- (NSArray *)listOfParserClasses {
    static dispatch_once_t once;
    static NSArray *sharedParserClasses = nil;
    dispatch_once(&once, ^{
        sharedParserClasses = @[RSRSSParser.class, RSAtomParser.class, RSOPMLParser.class];
    });
    return sharedParserClasses;
}
#pragma mark - Check Methods to Determine Parser Type
/// Instantiate the parser class that was determined for this data.
/// @return A new parser initialized with this object, or @c nil if no suitable parser class was found.
- (id)getParser {
    Class parserClass = _parserClass;
    // Messaging a nil class yields nil, so no explicit check is required.
    return [[parserClass alloc] initWithXMLData:self];
}
/// @return @c YES if a parser class (of any type) was determined and no error was recorded.
- (BOOL)canParseData {
    if (_parserError != nil) {
        return NO;
    }
    return (_parserClass != nil);
}
@end