Improved HTML tag removal
This commit is contained in:
@@ -30,6 +30,7 @@ and this project does adhere to [Semantic Versioning](https://semver.org/spec/v2
|
||||
- *UI:* If an error occurs, show the document URL (path to file or web URL)
|
||||
- Comparison of existing articles with nonexistent guid and link
|
||||
- Don't mark articles read if opening URLs failed
|
||||
- HTML tag removal keeps structure intact
|
||||
|
||||
### Changed
|
||||
- *UI:* Interface builder files replaced with code equivalent
|
||||
|
||||
@@ -32,6 +32,7 @@
|
||||
54ACC29521061E270020715F /* UpdateScheduler.m in Sources */ = {isa = PBXBuildFile; fileRef = 54ACC29421061E270020715F /* UpdateScheduler.m */; };
|
||||
54ACC29821061FBA0020715F /* Preferences.m in Sources */ = {isa = PBXBuildFile; fileRef = 54ACC29721061FBA0020715F /* Preferences.m */; };
|
||||
54AD4E0023005297000AE386 /* WebFeed.m in Sources */ = {isa = PBXBuildFile; fileRef = 54AD4DFF23005297000AE386 /* WebFeed.m */; };
|
||||
54AD4E0C2301853D000AE386 /* NSString+Ext.m in Sources */ = {isa = PBXBuildFile; fileRef = 54AD4E0B2301853D000AE386 /* NSString+Ext.m */; };
|
||||
54B51704226DC339006C1B29 /* ModalFeedEditView.m in Sources */ = {isa = PBXBuildFile; fileRef = 54B51703226DC339006C1B29 /* ModalFeedEditView.m */; };
|
||||
54B517072270E990006C1B29 /* NSView+Ext.m in Sources */ = {isa = PBXBuildFile; fileRef = 54B517062270E92A006C1B29 /* NSView+Ext.m */; };
|
||||
54B749DA2204A85C0022CC6D /* BarStatusItem.m in Sources */ = {isa = PBXBuildFile; fileRef = 54B749D92204A85C0022CC6D /* BarStatusItem.m */; };
|
||||
@@ -142,6 +143,8 @@
|
||||
54ACC29721061FBA0020715F /* Preferences.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = Preferences.m; sourceTree = "<group>"; };
|
||||
54AD4DFE23005297000AE386 /* WebFeed.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = WebFeed.h; sourceTree = "<group>"; };
|
||||
54AD4DFF23005297000AE386 /* WebFeed.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = WebFeed.m; sourceTree = "<group>"; };
|
||||
54AD4E0A2301853D000AE386 /* NSString+Ext.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "NSString+Ext.h"; sourceTree = "<group>"; };
|
||||
54AD4E0B2301853D000AE386 /* NSString+Ext.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = "NSString+Ext.m"; sourceTree = "<group>"; };
|
||||
54B51702226DC339006C1B29 /* ModalFeedEditView.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = ModalFeedEditView.h; sourceTree = "<group>"; };
|
||||
54B51703226DC339006C1B29 /* ModalFeedEditView.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = ModalFeedEditView.m; sourceTree = "<group>"; };
|
||||
54B517052270E8C6006C1B29 /* NSView+Ext.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "NSView+Ext.h"; sourceTree = "<group>"; };
|
||||
@@ -208,6 +211,8 @@
|
||||
54BB048821FD2AB500C303A5 /* NSDate+Ext.m */,
|
||||
54B517052270E8C6006C1B29 /* NSView+Ext.h */,
|
||||
54B517062270E92A006C1B29 /* NSView+Ext.m */,
|
||||
54AD4E0A2301853D000AE386 /* NSString+Ext.h */,
|
||||
54AD4E0B2301853D000AE386 /* NSString+Ext.m */,
|
||||
);
|
||||
path = Helper;
|
||||
sourceTree = "<group>";
|
||||
@@ -506,6 +511,7 @@
|
||||
files = (
|
||||
54AD4E0023005297000AE386 /* WebFeed.m in Sources */,
|
||||
54B51704226DC339006C1B29 /* ModalFeedEditView.m in Sources */,
|
||||
54AD4E0C2301853D000AE386 /* NSString+Ext.m in Sources */,
|
||||
546A6A2C22C584AF0034E806 /* SettingsAppearanceView.m in Sources */,
|
||||
54E9CF32225914300023696F /* SettingsAbout.m in Sources */,
|
||||
54B749E0220636200022CC6D /* FeedArticle+Ext.m in Sources */,
|
||||
|
||||
@@ -25,6 +25,7 @@
|
||||
#import "Constants.h"
|
||||
#import "UserPrefs.h"
|
||||
#import "StoreCoordinator.h"
|
||||
#import "NSString+Ext.h"
|
||||
|
||||
@implementation FeedArticle (Ext)
|
||||
|
||||
@@ -34,11 +35,10 @@
|
||||
fa.unread = YES;
|
||||
fa.guid = entry.guid;
|
||||
fa.title = entry.title;
|
||||
if (entry.abstract.length > 0) { // remove html tags and save plain text to db
|
||||
NSRegularExpression *regex = [NSRegularExpression regularExpressionWithPattern:@"<[^>]*>" options:kNilOptions error:nil];
|
||||
fa.abstract = [regex stringByReplacingMatchesInString:entry.abstract options:kNilOptions range:NSMakeRange(0, entry.abstract.length) withTemplate:@""];
|
||||
}
|
||||
fa.body = entry.body;
|
||||
if (entry.abstract.length > 0)
|
||||
fa.abstract = [entry.abstract htmlToPlainText];
|
||||
if (entry.body.length > 0)
|
||||
fa.body = [entry.body htmlToPlainText];
|
||||
fa.author = entry.author;
|
||||
fa.link = entry.link;
|
||||
fa.published = entry.datePublished;
|
||||
|
||||
@@ -28,6 +28,8 @@
|
||||
#import "FeedMeta+Ext.h"
|
||||
#import "FeedGroup+Ext.h"
|
||||
#import "NSDate+Ext.h"
|
||||
#import "NSString+Ext.h"
|
||||
|
||||
#include <stdatomic.h>
|
||||
|
||||
static BOOL _requestsAreUrgent = NO;
|
||||
@@ -104,7 +106,7 @@ static _Atomic(NSUInteger) _queueSize = 0;
|
||||
data = nil;
|
||||
} else if (status >= 500 && status < 600) { // 5xx Server Error
|
||||
NSString *reason = [NSString stringWithFormat:NSLocalizedString(@"Server HTTP error %ld.\n––––\n%@", nil),
|
||||
status, [self extractReadableHTML:data]];
|
||||
status, [NSString plainTextFromHTMLData:data]];
|
||||
error = [NSError errorWithDomain:NSURLErrorDomain code:NSURLErrorBadServerResponse userInfo:@{NSLocalizedDescriptionKey: reason}];
|
||||
data = nil;
|
||||
}
|
||||
@@ -112,19 +114,6 @@ static _Atomic(NSUInteger) _queueSize = 0;
|
||||
}] resume];
|
||||
}
|
||||
|
||||
/// Helper method to extract readable text from HTML.
///
/// Decodes @c data as UTF-8, replaces every run of consecutive tags (including the
/// whitespace following each tag) with a single newline, and trims whitespace and
/// newlines at both ends.
///
/// @param data Raw HTML bytes, expected to be UTF-8 encoded.
/// @return The readable text, or @c nil if @c data is @c nil or can't be decoded as UTF-8.
+ (NSString*)extractReadableHTML:(NSData*)data {
	NSString *str = data ? [[NSString alloc] initWithData:data encoding:NSUTF8StringEncoding] : nil;
	// NSRegularExpression raises an exception for a nil input string, so stop here
	// if the data could not be decoded.
	if (!str) return nil;
	// Collapse each run of <tags> (plus trailing whitespace) into a single new line in one pass.
	// No placeholder character is needed, so text that contains unusual symbols stays untouched.
	NSRegularExpression *tagRuns = [NSRegularExpression regularExpressionWithPattern:@"(?:<[^>]*>\\s*)+" options:kNilOptions error:nil];
	str = [tagRuns stringByReplacingMatchesInString:str options:kNilOptions range:NSMakeRange(0, str.length) withTemplate:@"\n"];
	// finally trim whitespace at start and end
	return [str stringByTrimmingCharactersInSet:NSCharacterSet.whitespaceAndNewlineCharacterSet];
}
|
||||
|
||||
|
||||
#pragma mark - Download RSS Feed
|
||||
|
||||
|
||||
28
baRSS/Helper/NSString+Ext.h
Normal file
28
baRSS/Helper/NSString+Ext.h
Normal file
@@ -0,0 +1,28 @@
|
||||
//
|
||||
// The MIT License (MIT)
|
||||
// Copyright (c) 2019 Oleg Geier
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
// this software and associated documentation files (the "Software"), to deal in
|
||||
// the Software without restriction, including without limitation the rights to
|
||||
// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||
// of the Software, and to permit persons to whom the Software is furnished to do
|
||||
// so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
@import Cocoa;
|
||||
|
||||
/// Helpers to reduce HTML markup to readable plain text.
@interface NSString (Ext)

/// Decodes @c data as UTF-8 and converts the resulting HTML to plain text.
/// @param data Raw HTML bytes. May be @c nil.
/// @return The plain text, or @c nil if @c data is @c nil or is not valid UTF-8.
+ (nullable NSString*)plainTextFromHTMLData:(nullable NSData*)data;

/// Treats the receiver as HTML and returns its text content without tags.
/// @return The extracted plain text (possibly empty), never @c nil.
- (nonnull NSString*)htmlToPlainText;

@end
|
||||
114
baRSS/Helper/NSString+Ext.m
Normal file
114
baRSS/Helper/NSString+Ext.m
Normal file
@@ -0,0 +1,114 @@
|
||||
//
|
||||
// The MIT License (MIT)
|
||||
// Copyright (c) 2019 Oleg Geier
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
// this software and associated documentation files (the "Software"), to deal in
|
||||
// the Software without restriction, including without limitation the rights to
|
||||
// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||
// of the Software, and to permit persons to whom the Software is furnished to do
|
||||
// so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#import "NSString+Ext.h"
|
||||
|
||||
@implementation NSString (Ext)

/// Init string with @c NSUTF8StringEncoding and call @c htmlToPlainText
/// @return Plain text, or @c nil if @c data is @c nil or can't be decoded as UTF-8.
+ (NSString*)plainTextFromHTMLData:(NSData*)data {
    if (!data) return nil;
    // If decoding fails the string is nil and messaging nil returns nil as well.
    return [[[NSString alloc] initWithData:data encoding:NSUTF8StringEncoding] htmlToPlainText];
}

/**
 Simple HTML parser to extract TEXT elements and semi-structured elements like list items.
 Ignores @c <head> , @c <style> and @c <script> tags.

 The receiver is scanned in chunks delimited by @c < and @c > . A chunk that follows @c <
 is interpreted as a tag, a chunk that follows @c > is copied to the output as text.
 Paragraph ends, headings, @c <label> and @c <br> produce a line break, @c <li> produces a
 numbered item inside @c <ol> and a bullet otherwise. Tag names are matched
 case-insensitively and self-closing tags such as @c <br/> are recognized.

 @return Plain text with runs of horizontal whitespace collapsed and surrounding
         whitespace trimmed. Never @c nil.
 */
- (nonnull NSString*)htmlToPlainText {
    NSScanner *scanner = [NSScanner scannerWithString:self];
    scanner.charactersToBeSkipped = NSCharacterSet.newlineCharacterSet; // ! else, some spaces are dropped
    NSCharacterSet *angleBrackets = [NSCharacterSet characterSetWithCharactersInString:@"<>"];
    unichar prev = '>'; // last bracket seen; '>' means "currently outside of a tag"
    int order = 0; // ul & ol: 0 = unordered, otherwise number of the next ordered item
    NSString *skip = nil; // head, style, script: closing tag we are waiting for

    // Starts with a space so that `result` is never empty when its last character is read below.
    NSMutableString *result = [NSMutableString stringWithString:@" "];
    while (![scanner isAtEnd]) {
        NSString *tag = nil;
        if ([scanner scanUpToCharactersFromSet:angleBrackets intoString:&tag]) {
            // parse html tag depending on type
            if (prev == '<') {
                if (skip) {
                    // skip everything between <head>, <style>, and <script> tags
                    if (CLOSE(tag, skip))
                        skip = nil;
                    // the bracket handling of the next iteration consumes the pending '>'
                    continue;
                }
                if (OPEN(tag, @"a")) [result appendString:@" "];
                else if (OPEN(tag, @"head")) skip = @"/head";
                else if (OPEN(tag, @"style")) skip = @"/style";
                else if (OPEN(tag, @"script")) skip = @"/script";
                else if (CLOSE(tag, @"/p") || OPEN(tag, @"label") || OPEN(tag, @"br"))
                    [result appendString:@"\n"];
                else if (OPEN(tag, @"h1") || OPEN(tag, @"h2") || OPEN(tag, @"h3") ||
                         OPEN(tag, @"h4") || OPEN(tag, @"h5") || OPEN(tag, @"h6") ||
                         CLOSE(tag, @"/h1") || CLOSE(tag, @"/h2") || CLOSE(tag, @"/h3") ||
                         CLOSE(tag, @"/h4") || CLOSE(tag, @"/h5") || CLOSE(tag, @"/h6"))
                    [result appendString:@"\n"];
                else if (OPEN(tag, @"ol")) order = 1;
                else if (OPEN(tag, @"ul")) order = 0;
                else if (OPEN(tag, @"li")) {
                    // ordered and unordered list items, each on its own line
                    unichar last = [result characterAtIndex:result.length - 1];
                    if (last != '\n') {
                        [result appendString:@"\n"];
                    }
                    if (order > 0) [result appendFormat:@" %d. ", order++];
                    else [result appendString:@" • "];
                }
            } else {
                // append text in between tags
                if (!skip) {
                    [result appendString:tag];
                }
            }
        }
        if (![scanner isAtEnd]) {
            // the scanner stopped at '<' or '>'; consume it and remember which one it was
            unichar next = [self characterAtIndex:scanner.scanLocation];
            if (prev == next) {
                // same bracket twice in a row is not markup, keep it as literal text
                if (!skip)
                    [result appendFormat:@"%c", prev];
            }
            prev = next;
            ++scanner.scanLocation;
        }
    }
    // collapsing multiple horizontal whitespaces (\h) into one (the first one)
    [[NSRegularExpression regularExpressionWithPattern:@"(\\h)[\\h]+" options:0 error:nil]
     replaceMatchesInString:result options:0 range:NSMakeRange(0, result.length) withTemplate:@"$1"];
    return [result stringByTrimmingCharactersInSet:NSCharacterSet.whitespaceAndNewlineCharacterSet];
}


#pragma mark - Helper methods


/**
 Check whether @c tag (the text between @c < and @c > ) opens the element named @c match.

 The name is compared case-insensitively and must be followed by the end of the tag,
 by whitespace (attributes follow) or by @c / (self-closing tag like @c <br/> ).
 */
NS_INLINE BOOL OPEN(NSString *tag, NSString *match) {
    NSUInteger nameLength = match.length;
    if (tag.length < nameLength)
        return NO;
    NSComparisonResult cmp = [tag compare:match options:NSCaseInsensitiveSearch range:NSMakeRange(0, nameLength)];
    if (cmp != NSOrderedSame)
        return NO;
    if (tag.length == nameLength)
        return YES;
    // the name must end here, otherwise e.g. "abbr" would be mistaken for "a"
    unichar delimiter = [tag characterAtIndex:nameLength];
    return (delimiter == '/' || [NSCharacterSet.whitespaceAndNewlineCharacterSet characterIsMember:delimiter]);
}

/**
 Check whether @c tag (the text between @c < and @c > ) is the closing tag @c match,
 e.g. @c "/p". Compared case-insensitively, surrounding whitespace is ignored.
 */
NS_INLINE BOOL CLOSE(NSString *tag, NSString *match) {
    NSString *trimmed = [tag stringByTrimmingCharactersInSet:NSCharacterSet.whitespaceAndNewlineCharacterSet];
    return [trimmed caseInsensitiveCompare:match] == NSOrderedSame;
}

@end
|
||||
@@ -60,7 +60,7 @@
|
||||
</dict>
|
||||
</array>
|
||||
<key>CFBundleVersion</key>
|
||||
<string>10386</string>
|
||||
<string>10678</string>
|
||||
<key>LSApplicationCategoryType</key>
|
||||
<string>public.app-category.news</string>
|
||||
<key>LSMinimumSystemVersion</key>
|
||||
|
||||
Reference in New Issue
Block a user