Improved HTML tag removal

This commit is contained in:
relikd
2019-08-12 18:56:52 +02:00
parent 202005eb0d
commit 5ff1753858
7 changed files with 158 additions and 20 deletions

View File

@@ -30,6 +30,7 @@ and this project does adhere to [Semantic Versioning](https://semver.org/spec/v2
- *UI:* If an error occurs, show document URL (path to file or web url)
- Comparison of existing articles with nonexistent guid and link
- Don't mark articles read if opening URLs failed
- HTML tag removal keeps structure intact
### Changed
- *UI:* Interface builder files replaced with code equivalent

View File

@@ -32,6 +32,7 @@
54ACC29521061E270020715F /* UpdateScheduler.m in Sources */ = {isa = PBXBuildFile; fileRef = 54ACC29421061E270020715F /* UpdateScheduler.m */; };
54ACC29821061FBA0020715F /* Preferences.m in Sources */ = {isa = PBXBuildFile; fileRef = 54ACC29721061FBA0020715F /* Preferences.m */; };
54AD4E0023005297000AE386 /* WebFeed.m in Sources */ = {isa = PBXBuildFile; fileRef = 54AD4DFF23005297000AE386 /* WebFeed.m */; };
54AD4E0C2301853D000AE386 /* NSString+Ext.m in Sources */ = {isa = PBXBuildFile; fileRef = 54AD4E0B2301853D000AE386 /* NSString+Ext.m */; };
54B51704226DC339006C1B29 /* ModalFeedEditView.m in Sources */ = {isa = PBXBuildFile; fileRef = 54B51703226DC339006C1B29 /* ModalFeedEditView.m */; };
54B517072270E990006C1B29 /* NSView+Ext.m in Sources */ = {isa = PBXBuildFile; fileRef = 54B517062270E92A006C1B29 /* NSView+Ext.m */; };
54B749DA2204A85C0022CC6D /* BarStatusItem.m in Sources */ = {isa = PBXBuildFile; fileRef = 54B749D92204A85C0022CC6D /* BarStatusItem.m */; };
@@ -142,6 +143,8 @@
54ACC29721061FBA0020715F /* Preferences.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = Preferences.m; sourceTree = "<group>"; };
54AD4DFE23005297000AE386 /* WebFeed.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = WebFeed.h; sourceTree = "<group>"; };
54AD4DFF23005297000AE386 /* WebFeed.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = WebFeed.m; sourceTree = "<group>"; };
54AD4E0A2301853D000AE386 /* NSString+Ext.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "NSString+Ext.h"; sourceTree = "<group>"; };
54AD4E0B2301853D000AE386 /* NSString+Ext.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = "NSString+Ext.m"; sourceTree = "<group>"; };
54B51702226DC339006C1B29 /* ModalFeedEditView.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = ModalFeedEditView.h; sourceTree = "<group>"; };
54B51703226DC339006C1B29 /* ModalFeedEditView.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = ModalFeedEditView.m; sourceTree = "<group>"; };
54B517052270E8C6006C1B29 /* NSView+Ext.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "NSView+Ext.h"; sourceTree = "<group>"; };
@@ -208,6 +211,8 @@
54BB048821FD2AB500C303A5 /* NSDate+Ext.m */,
54B517052270E8C6006C1B29 /* NSView+Ext.h */,
54B517062270E92A006C1B29 /* NSView+Ext.m */,
54AD4E0A2301853D000AE386 /* NSString+Ext.h */,
54AD4E0B2301853D000AE386 /* NSString+Ext.m */,
);
path = Helper;
sourceTree = "<group>";
@@ -506,6 +511,7 @@
files = (
54AD4E0023005297000AE386 /* WebFeed.m in Sources */,
54B51704226DC339006C1B29 /* ModalFeedEditView.m in Sources */,
54AD4E0C2301853D000AE386 /* NSString+Ext.m in Sources */,
546A6A2C22C584AF0034E806 /* SettingsAppearanceView.m in Sources */,
54E9CF32225914300023696F /* SettingsAbout.m in Sources */,
54B749E0220636200022CC6D /* FeedArticle+Ext.m in Sources */,

View File

@@ -25,6 +25,7 @@
#import "Constants.h"
#import "UserPrefs.h"
#import "StoreCoordinator.h"
#import "NSString+Ext.h"
@implementation FeedArticle (Ext)
@@ -34,11 +35,10 @@
fa.unread = YES;
fa.guid = entry.guid;
fa.title = entry.title;
if (entry.abstract.length > 0) { // remove html tags and save plain text to db
NSRegularExpression *regex = [NSRegularExpression regularExpressionWithPattern:@"<[^>]*>" options:kNilOptions error:nil];
fa.abstract = [regex stringByReplacingMatchesInString:entry.abstract options:kNilOptions range:NSMakeRange(0, entry.abstract.length) withTemplate:@""];
}
fa.body = entry.body;
if (entry.abstract.length > 0)
fa.abstract = [entry.abstract htmlToPlainText];
if (entry.body.length > 0)
fa.body = [entry.body htmlToPlainText];
fa.author = entry.author;
fa.link = entry.link;
fa.published = entry.datePublished;

View File

@@ -28,6 +28,8 @@
#import "FeedMeta+Ext.h"
#import "FeedGroup+Ext.h"
#import "NSDate+Ext.h"
#import "NSString+Ext.h"
#include <stdatomic.h>
static BOOL _requestsAreUrgent = NO;
@@ -104,7 +106,7 @@ static _Atomic(NSUInteger) _queueSize = 0;
data = nil;
} else if (status >= 500 && status < 600) { // 5xx Server Error
NSString *reason = [NSString stringWithFormat:NSLocalizedString(@"Server HTTP error %ld.\n\n%@", nil),
status, [self extractReadableHTML:data]];
status, [NSString plainTextFromHTMLData:data]];
error = [NSError errorWithDomain:NSURLErrorDomain code:NSURLErrorBadServerResponse userInfo:@{NSLocalizedDescriptionKey: reason}];
data = nil;
}
@@ -112,19 +114,6 @@ static _Atomic(NSUInteger) _queueSize = 0;
}] resume];
}
/**
 Helper method to extract readable text from HTML.

 Every run of consecutive tags (including the whitespace that follows each tag)
 is replaced by a single new line, then the result is trimmed.

 @param data Raw response body, decoded as UTF-8.
 @return Text with all @c <tags> removed, or @c nil if @c data is not valid UTF-8.
 */
+ (NSString*)extractReadableHTML:(NSData*)data {
    NSString *str = [[NSString alloc] initWithData:data encoding:NSUTF8StringEncoding];
    if (!str) return nil; // not decodable, nothing to clean up
    // Collapse each run of <tags> (plus trailing whitespace) into one new line in a single pass.
    // No placeholder character is needed, so no character of the original text can be mistaken for one.
    NSRegularExpression *tagRun = [NSRegularExpression regularExpressionWithPattern:@"(?:<[^>]*>\\s*)+" options:kNilOptions error:nil];
    str = [tagRun stringByReplacingMatchesInString:str options:kNilOptions range:NSMakeRange(0, str.length) withTemplate:@"\n"];
    // finally trim whitespace at start and end
    return [str stringByTrimmingCharactersInSet:NSCharacterSet.whitespaceAndNewlineCharacterSet];
}
#pragma mark - Download RSS Feed

View File

@@ -0,0 +1,28 @@
//
// The MIT License (MIT)
// Copyright (c) 2019 Oleg Geier
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of
// this software and associated documentation files (the "Software"), to deal in
// the Software without restriction, including without limitation the rights to
// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
// of the Software, and to permit persons to whom the Software is furnished to do
// so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
@import Cocoa;
/// HTML related convenience methods for @c NSString
@interface NSString (Ext)
/**
 Create a string from @c data using @c NSUTF8StringEncoding and convert it with @c htmlToPlainText.

 @param data Raw HTML bytes. May be @c nil.
 @return Plain text, or @c nil if @c data is @c nil or can not be decoded into a string.
 */
+ (nullable NSString*)plainTextFromHTMLData:(nullable NSData*)data;
/**
 Remove HTML tags from the receiver but keep text and semi-structured elements like list items.
 Content of @c <head> , @c <style> and @c <script> is ignored.

 @return Plain text with leading and trailing whitespace removed.
 */
- (nonnull NSString*)htmlToPlainText;
@end

114
baRSS/Helper/NSString+Ext.m Normal file
View File

@@ -0,0 +1,114 @@
//
// The MIT License (MIT)
// Copyright (c) 2019 Oleg Geier
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of
// this software and associated documentation files (the "Software"), to deal in
// the Software without restriction, including without limitation the rights to
// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
// of the Software, and to permit persons to whom the Software is furnished to do
// so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#import "NSString+Ext.h"
@implementation NSString (Ext)

#pragma mark - Helper methods

/**
 Extract the lowercased element name from the raw text between @c '<' and @c '>'.

 A leading @c '/' (closing tag) is kept. Everything from the first whitespace or
 (self-closing) slash onwards is dropped: @c "BR/" → @c "br" , @c "a href=x" → @c "a" , @c "/P" → @c "/p" .
 Defined before first use so no implicit function declaration is needed.
 */
NS_INLINE NSString *TagName(NSString *tag) {
    NSCharacterSet *blank = NSCharacterSet.whitespaceAndNewlineCharacterSet;
    NSUInteger end = 1; // start at 1 to keep the '/' of a closing tag
    for (; end < tag.length; ++end) {
        unichar c = [tag characterAtIndex:end];
        if (c == '/' || [blank characterIsMember:c])
            break;
    }
    if (end < tag.length)
        tag = [tag substringToIndex:end];
    return tag.lowercaseString;
}

#pragma mark - HTML to plain text

/// Init string with @c NSUTF8StringEncoding and call @c htmlToPlainText
/// @return @c nil if @c data is @c nil or not valid UTF-8.
+ (NSString*)plainTextFromHTMLData:(NSData*)data {
    if (!data) return nil;
    // messaging nil returns nil, so a failed decode propagates as nil
    return [[[NSString alloc] initWithData:data encoding:NSUTF8StringEncoding] htmlToPlainText];
}

/**
 Simple HTML parser to extract TEXT elements and semi-structured elements like list items.
 Ignores @c <head> , @c <style> and @c <script> tags.

 Tag names are matched case-insensitive and independent of attributes or a
 self-closing slash ( @c <br> , @c <BR> and @c <br/> are all treated the same).

 @return Plain text, trimmed, with runs of horizontal whitespace collapsed into one.
 */
- (nonnull NSString*)htmlToPlainText {
    NSScanner *scanner = [NSScanner scannerWithString:self];
    scanner.charactersToBeSkipped = NSCharacterSet.newlineCharacterSet; // ! else, some spaces are dropped
    NSCharacterSet *angleBrackets = [NSCharacterSet characterSetWithCharactersInString:@"<>"];
    // opening and closing heading tags both start a new line
    NSSet<NSString*> *headings = [NSSet setWithObjects:
                                  @"h1", @"h2", @"h3", @"h4", @"h5", @"h6",
                                  @"/h1", @"/h2", @"/h3", @"/h4", @"/h5", @"/h6", nil];

    unichar prev = '>'; // last bracket seen; '>' means we are outside of a tag
    int order = 0; // ul & ol; > 0 is the next number of an ordered list
    NSString *skip = nil; // closing tag name we are waiting for (head, style, script)
    NSMutableString *result = [NSMutableString stringWithString:@" "]; // never empty, see 'li' below

    while (![scanner isAtEnd]) {
        NSString *tag = nil;
        if ([scanner scanUpToCharactersFromSet:angleBrackets intoString:&tag]) {
            if (prev == '<') {
                // parse html tag depending on type
                NSString *name = TagName(tag);
                if (skip) {
                    // skip everything between <head>, <style>, and <script> tags
                    if ([name isEqualToString:skip])
                        skip = nil;
                    continue; // the bracket itself is consumed by the next (empty) scan
                }
                if ([name isEqualToString:@"a"]) [result appendString:@" "];
                else if ([name isEqualToString:@"head"]) skip = @"/head";
                else if ([name isEqualToString:@"style"]) skip = @"/style";
                else if ([name isEqualToString:@"script"]) skip = @"/script";
                else if ([name isEqualToString:@"/p"] || [name isEqualToString:@"label"] || [name isEqualToString:@"br"])
                    [result appendString:@"\n"];
                else if ([headings containsObject:name])
                    [result appendString:@"\n"];
                else if ([name isEqualToString:@"ol"]) order = 1;
                else if ([name isEqualToString:@"ul"]) order = 0;
                else if ([name isEqualToString:@"li"]) {
                    // ordered and unordered list items, each on its own line
                    unichar last = [result characterAtIndex:result.length - 1];
                    if (last != '\n') {
                        [result appendString:@"\n"];
                    }
                    if (order > 0) [result appendFormat:@" %d. ", order++];
                    else [result appendString:@" • "];
                }
            } else if (!skip) {
                // append text in between tags
                [result appendString:tag];
            }
        }
        if (![scanner isAtEnd]) {
            unichar next = [self characterAtIndex:scanner.scanLocation];
            // same character twice in a row is not markup but part of the text
            if (prev == next && !skip)
                [result appendFormat:@"%C", prev]; // %C, because prev is a unichar
            prev = next;
            ++scanner.scanLocation;
        }
    }
    // collapsing multiple horizontal whitespaces (\h) into one (the first one)
    [[NSRegularExpression regularExpressionWithPattern:@"(\\h)[\\h]+" options:0 error:nil]
     replaceMatchesInString:result options:0 range:NSMakeRange(0, result.length) withTemplate:@"$1"];
    return [result stringByTrimmingCharactersInSet:NSCharacterSet.whitespaceAndNewlineCharacterSet];
}

@end

View File

@@ -60,7 +60,7 @@
</dict>
</array>
<key>CFBundleVersion</key>
<string>10386</string>
<string>10678</string>
<key>LSApplicationCategoryType</key>
<string>public.app-category.news</string>
<key>LSMinimumSystemVersion</key>