diff --git a/CHANGELOG.md b/CHANGELOG.md index ab20b5a..8e47752 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -30,6 +30,7 @@ and this project does adhere to [Semantic Versioning](https://semver.org/spec/v2 - *UI:* If an error occurs, show document URL (path to file or web url) - Comparison of existing articles with nonexistent guid and link - Don't mark articles read if opening URLs failed +- HTML tag removal keeps structure intact ### Changed - *UI:* Interface builder files replaced with code equivalent diff --git a/baRSS.xcodeproj/project.pbxproj b/baRSS.xcodeproj/project.pbxproj index 06bd2d3..c126378 100644 --- a/baRSS.xcodeproj/project.pbxproj +++ b/baRSS.xcodeproj/project.pbxproj @@ -32,6 +32,7 @@ 54ACC29521061E270020715F /* UpdateScheduler.m in Sources */ = {isa = PBXBuildFile; fileRef = 54ACC29421061E270020715F /* UpdateScheduler.m */; }; 54ACC29821061FBA0020715F /* Preferences.m in Sources */ = {isa = PBXBuildFile; fileRef = 54ACC29721061FBA0020715F /* Preferences.m */; }; 54AD4E0023005297000AE386 /* WebFeed.m in Sources */ = {isa = PBXBuildFile; fileRef = 54AD4DFF23005297000AE386 /* WebFeed.m */; }; + 54AD4E0C2301853D000AE386 /* NSString+Ext.m in Sources */ = {isa = PBXBuildFile; fileRef = 54AD4E0B2301853D000AE386 /* NSString+Ext.m */; }; 54B51704226DC339006C1B29 /* ModalFeedEditView.m in Sources */ = {isa = PBXBuildFile; fileRef = 54B51703226DC339006C1B29 /* ModalFeedEditView.m */; }; 54B517072270E990006C1B29 /* NSView+Ext.m in Sources */ = {isa = PBXBuildFile; fileRef = 54B517062270E92A006C1B29 /* NSView+Ext.m */; }; 54B749DA2204A85C0022CC6D /* BarStatusItem.m in Sources */ = {isa = PBXBuildFile; fileRef = 54B749D92204A85C0022CC6D /* BarStatusItem.m */; }; @@ -142,6 +143,8 @@ 54ACC29721061FBA0020715F /* Preferences.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = Preferences.m; sourceTree = ""; }; 54AD4DFE23005297000AE386 /* WebFeed.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path 
= WebFeed.h; sourceTree = ""; }; 54AD4DFF23005297000AE386 /* WebFeed.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = WebFeed.m; sourceTree = ""; }; + 54AD4E0A2301853D000AE386 /* NSString+Ext.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "NSString+Ext.h"; sourceTree = ""; }; + 54AD4E0B2301853D000AE386 /* NSString+Ext.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = "NSString+Ext.m"; sourceTree = ""; }; 54B51702226DC339006C1B29 /* ModalFeedEditView.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = ModalFeedEditView.h; sourceTree = ""; }; 54B51703226DC339006C1B29 /* ModalFeedEditView.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = ModalFeedEditView.m; sourceTree = ""; }; 54B517052270E8C6006C1B29 /* NSView+Ext.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "NSView+Ext.h"; sourceTree = ""; }; @@ -208,6 +211,8 @@ 54BB048821FD2AB500C303A5 /* NSDate+Ext.m */, 54B517052270E8C6006C1B29 /* NSView+Ext.h */, 54B517062270E92A006C1B29 /* NSView+Ext.m */, + 54AD4E0A2301853D000AE386 /* NSString+Ext.h */, + 54AD4E0B2301853D000AE386 /* NSString+Ext.m */, ); path = Helper; sourceTree = ""; @@ -506,6 +511,7 @@ files = ( 54AD4E0023005297000AE386 /* WebFeed.m in Sources */, 54B51704226DC339006C1B29 /* ModalFeedEditView.m in Sources */, + 54AD4E0C2301853D000AE386 /* NSString+Ext.m in Sources */, 546A6A2C22C584AF0034E806 /* SettingsAppearanceView.m in Sources */, 54E9CF32225914300023696F /* SettingsAbout.m in Sources */, 54B749E0220636200022CC6D /* FeedArticle+Ext.m in Sources */, diff --git a/baRSS/Core Data/FeedArticle+Ext.m b/baRSS/Core Data/FeedArticle+Ext.m index b77823e..e00a286 100644 --- a/baRSS/Core Data/FeedArticle+Ext.m +++ b/baRSS/Core Data/FeedArticle+Ext.m @@ -25,6 +25,7 @@ #import "Constants.h" #import "UserPrefs.h" #import "StoreCoordinator.h" +#import "NSString+Ext.h" @implementation FeedArticle (Ext) @@ 
-34,11 +35,10 @@ fa.unread = YES; fa.guid = entry.guid; fa.title = entry.title; - if (entry.abstract.length > 0) { // remove html tags and save plain text to db - NSRegularExpression *regex = [NSRegularExpression regularExpressionWithPattern:@"<[^>]*>" options:kNilOptions error:nil]; - fa.abstract = [regex stringByReplacingMatchesInString:entry.abstract options:kNilOptions range:NSMakeRange(0, entry.abstract.length) withTemplate:@""]; - } - fa.body = entry.body; + if (entry.abstract.length > 0) + fa.abstract = [entry.abstract htmlToPlainText]; + if (entry.body.length > 0) + fa.body = [entry.body htmlToPlainText]; fa.author = entry.author; fa.link = entry.link; fa.published = entry.datePublished; diff --git a/baRSS/Feed Import/WebFeed.m b/baRSS/Feed Import/WebFeed.m index 3df6e9b..a307c0d 100644 --- a/baRSS/Feed Import/WebFeed.m +++ b/baRSS/Feed Import/WebFeed.m @@ -28,6 +28,8 @@ #import "FeedMeta+Ext.h" #import "FeedGroup+Ext.h" #import "NSDate+Ext.h" +#import "NSString+Ext.h" + #include static BOOL _requestsAreUrgent = NO; @@ -104,7 +106,7 @@ static _Atomic(NSUInteger) _queueSize = 0; data = nil; } else if (status >= 500 && status < 600) { // 5xx Server Error NSString *reason = [NSString stringWithFormat:NSLocalizedString(@"Server HTTP error %ld.\n––––\n%@", nil), - status, [self extractReadableHTML:data]]; + status, [NSString plainTextFromHTMLData:data]]; error = [NSError errorWithDomain:NSURLErrorDomain code:NSURLErrorBadServerResponse userInfo:@{NSLocalizedDescriptionKey: reason}]; data = nil; } @@ -112,19 +114,6 @@ static _Atomic(NSUInteger) _queueSize = 0; }] resume]; } -/// Helper method to extract readable text from HTML -+ (NSString*)extractReadableHTML:(NSData*)data { - NSString *str = [[NSString alloc] initWithData:data encoding:NSUTF8StringEncoding]; - // replace all with (presumably) non-used character - str = [[NSRegularExpression regularExpressionWithPattern:@"<[^>]*>\\s*" options:kNilOptions error:nil] - stringByReplacingMatchesInString:str 
options:kNilOptions range:NSMakeRange(0, str.length) withTemplate:@"◊"]; - // then replace multiple occurences of that character with a single new line - str = [[NSRegularExpression regularExpressionWithPattern:@"◊+" options:kNilOptions error:nil] - stringByReplacingMatchesInString:str options:kNilOptions range:NSMakeRange(0, str.length) withTemplate:@"\n"]; - // finally trim whitespace at start and end - return [str stringByTrimmingCharactersInSet: NSCharacterSet.whitespaceAndNewlineCharacterSet]; -} - #pragma mark - Download RSS Feed diff --git a/baRSS/Helper/NSString+Ext.h b/baRSS/Helper/NSString+Ext.h new file mode 100644 index 0000000..79cb81d --- /dev/null +++ b/baRSS/Helper/NSString+Ext.h @@ -0,0 +1,28 @@ +// +// The MIT License (MIT) +// Copyright (c) 2019 Oleg Geier +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of +// this software and associated documentation files (the "Software"), to deal in +// the Software without restriction, including without limitation the rights to +// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +// of the Software, and to permit persons to whom the Software is furnished to do +// so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+ +@import Cocoa; + +@interface NSString (Ext) ++ (nullable NSString*)plainTextFromHTMLData:(nullable NSData*)data; +- (nonnull NSString*)htmlToPlainText; +@end diff --git a/baRSS/Helper/NSString+Ext.m b/baRSS/Helper/NSString+Ext.m new file mode 100644 index 0000000..46a2606 --- /dev/null +++ b/baRSS/Helper/NSString+Ext.m @@ -0,0 +1,114 @@ +// +// The MIT License (MIT) +// Copyright (c) 2019 Oleg Geier +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of +// this software and associated documentation files (the "Software"), to deal in +// the Software without restriction, including without limitation the rights to +// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +// of the Software, and to permit persons to whom the Software is furnished to do +// so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#import "NSString+Ext.h" + +@implementation NSString (Ext) + +/// Decode @c data with @c NSUTF8StringEncoding and pass the result to @c htmlToPlainText. Returns @c nil if @c data is @c nil or cannot be decoded. ++ (nullable NSString*)plainTextFromHTMLData:(nullable NSData*)data { + if (!data) return nil; + return [[[NSString alloc] initWithData:data encoding:NSUTF8StringEncoding] htmlToPlainText]; +} + +/** + Simple HTML parser to extract TEXT elements and semi-structured elements like list items. + Ignores @c , @c