// Overview of this file (review notes).
//
// NOTE(review): in this copy the file is collapsed onto a few physical lines and several
// spans look truncated (for example right after `startIndex..`, and the generic arguments
// of `Set` / `Range`). The notes below only describe what is still visible here; confirm
// against the repository copy before relying on them.
//
// MastodonStatusContent.parse(content:emojiDict:)
//   - For every shortcode key in `emojiDict`, replaces each `:shortcode:` occurrence in
//     `content` with the `emojiNode` string, then trims surrounding whitespace and newlines.
//   - Parses the resulting document with `Node.parse(document:)` (errors propagate via `try`),
//     takes the root node's text, and collects entity nodes with `Node.entities(in:)`.
//   - The remainder of the function is not visible in this copy.
//
// MastodonStatusContent.ParseResult
//   - Plain value holding `document`, `original`, `trimmed` and `activeEntities`.
//     How each field is filled is not visible in this copy.
//
// MastodonStatusContent.Node.init(level:text:tagName:className:href:hrefEllipsis:children:)
//   - Splits `className` on single spaces into a set (an empty set when `className` is nil).
//   - Derives `type`, checked in this order:
//       1. tag is `a` and the classes do not contain `mention`      -> .url
//       2. classes contain `mention` and `u-url`                     -> .mention
//          classes contain `mention` and `hashtag` (but no `u-url`)  -> .hashtag
//       3. classes contain `emoji`                                   -> .emoji
//       4. otherwise                                                 -> nil
//     A `mention` node with neither `u-url` nor `hashtag` falls through to the `emoji` check.
//
// MastodonStatusContent.Node.parse(document:)
//   - Builds a Kanna `HTML` document from the string (this is the throwing call).
//   - Adds a next sibling after every `br` element so that line breaks survive the
//     conversion to plain text (see the Kanna issue referenced inline). That sibling is built
//     with `try!` and `.body!`, so a failure there traps instead of throwing.
//   - Returns a root node at level 0 whose text is the body text (or "" when there is no
//     body) and whose children come from `parse(element:parentText:parentLevel:)` called
//     with `level + 1`.
//
// MastodonStatusContent.Node.parse(element:parentText:parentLevel:)
//   - Walks elements starting at `parent.at_css(":first-child")`.
//   - Uses a `Scanner` over a copy of `parentText` with `charactersToBeSkipped = .none`,
//     scans up to each element's text, and records the UTF-16 offsets before and after the
//     match; those offsets are then turned into indices of `parentText.utf16`.
//   - NOTE(review): the visible `guard ... else { assertionFailure(); continue }` does not
//     reassign `element` before `continue`. Unless the loop advances it somewhere that is not
//     visible here, a failed scan would revisit the same element forever in builds where
//     `assertionFailure()` does not stop execution — confirm and consider advancing first.
//
// MastodonStatusContent.Node.collect(node:where:)
//   - Appends `node` when the predicate accepts it, then appends the results of the same
//     collection on each child in order (parent before children).
//
// MastodonStatusContent.Node.Type and the entities/hashtags/mentions/urls helpers
//   - `entities(in:)` keeps nodes whose `type` is non-nil; the other three keep nodes whose
//     `type` equals `.hashtag`, `.mention` or `.url` respectively. All go through `collect`.
//
// MastodonStatusContent.Node.debugDescription
//   - Renders "<tag>" + "@[sorted, class, names]" (omitted when there are no classes)
//     + "(href - hrefEllipsis)" (omitted when both are nil, otherwise nil parts print as
//     "nil") + ": " + text.
//   - When the node has children, each child's description is placed on its own line,
//     prefixed with `level` repetitions of the indent literal.
//
// // MastodonStatusContent.swift // Mastodon // // Created by MainasuK Cirno on 2021/2/1. // import Foundation import Kanna import ActiveLabel enum MastodonStatusContent { typealias EmojiShortcode = String typealias EmojiDict = [EmojiShortcode: URL] static func parse(content: String, emojiDict: EmojiDict) throws -> MastodonStatusContent.ParseResult { let document: String = { var content = content for (shortcode, url) in emojiDict { let emojiNode = "\(shortcode)" let pattern = ":\(shortcode):" content = content.replacingOccurrences(of: pattern, with: emojiNode) } return content.trimmingCharacters(in: .whitespacesAndNewlines) }() let rootNode = try Node.parse(document: document) let text = String(rootNode.text) var activeEntities: [ActiveEntity] = [] let entities = MastodonStatusContent.Node.entities(in: rootNode) for entity in entities { let range = NSRange(entity.text.startIndex.. String { guard self.hasPrefix(prefix) else { return self } return String(self.dropFirst(prefix.count)) } } extension MastodonStatusContent { struct ParseResult { let document: String let original: String let trimmed: String let activeEntities: [ActiveEntity] } } extension MastodonStatusContent { class Node { let level: Int let type: Type? // substring text let text: Substring // range in parent String var range: Range { return text.startIndex.. let href: String? let hrefEllipsis: String? let children: [Node] init( level: Int, text: Substring, tagName: String?, className: String?, href: String?, hrefEllipsis: String?, children: [Node] ) { let _classNames: Set = { guard let className = className else { return Set() } return Set(className.components(separatedBy: " ")) }() let _type: Type? 
= { if tagName == "a" && !_classNames.contains("mention") { return .url } if _classNames.contains("mention") { if _classNames.contains("u-url") { return .mention } else if _classNames.contains("hashtag") { return .hashtag } } if _classNames.contains("emoji") { return .emoji } return nil }() self.level = level self.type = _type self.text = text self.tagName = tagName self.classNames = _classNames self.href = href self.hrefEllipsis = hrefEllipsis self.children = children } static func parse(document: String) throws -> MastodonStatusContent.Node { let html = try HTML(html: document, encoding: .utf8) // add `\r\n` explicit due to Kanna text missing it after convert to text // ref: https://github.com/tid-kijyun/Kanna/issues/150 let brNodes = html.css("br").makeIterator() while let brNode = brNodes.next() { brNode.addNextSibling(try! HTML(html: "\r\n", encoding: .utf8).body!) } let body = html.body ?? nil let text = body?.text ?? "" let level = 0 let children: [MastodonStatusContent.Node] = body.flatMap { body in return Node.parse(element: body, parentText: text[...], parentLevel: level + 1) } ?? [] let node = Node( level: level, text: text[...], tagName: body?.tagName, className: body?.className, href: nil, hrefEllipsis: nil, children: children ) return node } static func parse(element: XMLElement, parentText: Substring, parentLevel: Int) -> [Node] { let parent = element let scanner = Scanner(string: String(parentText)) scanner.charactersToBeSkipped = .none var element = parent.at_css(":first-child") var children: [Node] = [] while let _element = element { let _text = _element.text ?? 
"" // scan element text _ = scanner.scanUpToString(_text) let startIndexOffset = scanner.currentIndex.utf16Offset(in: scanner.string) guard scanner.scanString(_text) != nil else { assertionFailure() continue } let endIndexOffset = scanner.currentIndex.utf16Offset(in: scanner.string) // locate substring let startIndex = parentText.utf16.index(parentText.utf16.startIndex, offsetBy: startIndexOffset) let endIndex = parentText.utf16.index(parentText.utf16.startIndex, offsetBy: endIndexOffset) let text = Substring(parentText.utf16[startIndex.. Bool ) -> [Node] { var nodes: [Node] = [] if predicate(node) { nodes.append(node) } for child in node.children { nodes.append(contentsOf: Node.collect(node: child, where: predicate)) } return nodes } } } extension MastodonStatusContent.Node { enum `Type` { case url case mention case hashtag case emoji } static func entities(in node: MastodonStatusContent.Node) -> [MastodonStatusContent.Node] { return MastodonStatusContent.Node.collect(node: node) { node in node.type != nil } } static func hashtags(in node: MastodonStatusContent.Node) -> [MastodonStatusContent.Node] { return MastodonStatusContent.Node.collect(node: node) { node in node.type == .hashtag } } static func mentions(in node: MastodonStatusContent.Node) -> [MastodonStatusContent.Node] { return MastodonStatusContent.Node.collect(node: node) { node in node.type == .mention } } static func urls(in node: MastodonStatusContent.Node) -> [MastodonStatusContent.Node] { return MastodonStatusContent.Node.collect(node: node) { node in node.type == .url } } } extension MastodonStatusContent.Node: CustomDebugStringConvertible { var debugDescription: String { let linkInfo: String = { switch (href, hrefEllipsis) { case (nil, nil): return "" case (let href, let hrefEllipsis): return "(\(href ?? "nil") - \(hrefEllipsis ?? 
"nil"))" } }() let classNamesInfo: String = { guard !classNames.isEmpty else { return "" } let names = Array(classNames) .sorted() .joined(separator: ", ") return "@[\(names)]" }() let nodeDescription = String( format: "<%@>%@%@: %@", tagName ?? "", classNamesInfo, linkInfo, String(text) ) guard !children.isEmpty else { return nodeDescription } let indent = Array(repeating: " ", count: level).joined() let childrenDescription = children .map { indent + $0.debugDescription } .joined(separator: "\n") return nodeDescription + "\n" + childrenDescription } }