diff --git a/src/app/htmlparser.h b/src/app/htmlparser.h index c656542b..3b23b573 100644 --- a/src/app/htmlparser.h +++ b/src/app/htmlparser.h @@ -39,6 +39,7 @@ public: doc_ = tidyCreate(); tidyOptSetBool(doc_, TidyQuiet, yes); tidyOptSetBool(doc_, TidyShowWarnings, no); + tidyOptSetInt(doc_, TidyUseCustomTags, TidyCustomEmpty); } ~HtmlParser() @@ -51,46 +52,88 @@ public: return tidyParseString(doc_, html.toLocal8Bit().data()) >= 0; } - using TagInfoList = QMap>; + using TagNodeList = QMap>; // A function that traverses the DOM tree and fills a QVariantMap with a list - // of the tags and their values. The result is structured as follows: - // {tagId1: ["tagValue1", "tagValue2", ...], - // tagId: ["tagValue1", "tagValue2", ...], + // of the tags and their nodes. The result is structured as follows: + // {tagId1: [tagNode1, tagNode2, ...], + // tagId2: [tagNode3, tagNode4, ...], // ... } - TagInfoList getTags(QList tags, int maxDepth = -1) + TagNodeList getTagsNodes(const QList& tags, int maxDepth = -1) { - TagInfoList result; + TagNodeList result; traverseNode( tidyGetRoot(doc_), tags, - [&result](const QString& value, TidyTagId tag) { result[tag].append(value); }, + [&result](TidyNode node, TidyTagId tag) { result[tag].append(node); }, maxDepth); return result; } - QString getFirstTagValue(TidyTagId tag, int maxDepth = -1) + // The same as the above function, only it returns the first node for a single tag. + TidyNode getFirstTagNode(TidyTagId tag, int maxDepth = -1) { - QString result; + TidyNode result = nullptr; traverseNode( tidyGetRoot(doc_), {tag}, - [&result](const QString& value, TidyTagId) { result = value; }, + [&result](TidyNode node, TidyTagId) { result = node; }, maxDepth); return result; } -private: - void traverseNode(TidyNode node, - QList tags, - const std::function& cb, - int depth = -1) + // Extract the text value from a node. 
+ QString getNodeText(TidyNode node) { TidyBuffer nodeValue = {}; + if (!node || tidyNodeGetText(doc_, node, &nodeValue) != yes) { + return QString(); + } + QString result = QString::fromUtf8((char*) nodeValue.bp, nodeValue.size); + tidyBufFree(&nodeValue); + return result; + } + + // Extract the attribute value from a node. + QString getNodeAttr(TidyNode node, TidyAttrId attrId) + { + TidyAttr attr = tidyAttrGetById(node, attrId); + if (!attr) { + return QString(); + } + const auto* attrValue = tidyAttrValue(attr); + if (!attrValue) { + return QString(); + } + return QString::fromLocal8Bit(attrValue); + } + + // Extract the inner HTML of a node. + QString getNodeInnerHtml(TidyNode node) + { + if (!node) { + return QString(); + } + const auto* child = tidyGetChild(node); + return child ? getNodeText(child) : QString(); + } + + QString getTagInnerHtml(TidyTagId tag) + { + return getNodeInnerHtml(getFirstTagNode(tag)); + } + +private: + // NOLINTNEXTLINE(misc-no-recursion) + void traverseNode(TidyNode node, + const QList& tags, + const std::function& cb, + int depth = -1) + { for (auto tag : tags) { - if (tidyNodeGetId(node) == tag && tidyNodeGetText(doc_, node, &nodeValue) == yes && cb) { - cb(QString::fromLocal8Bit(nodeValue.bp), tag); + if (tidyNodeGetId(node) == tag && cb) { + cb(node, tag); if (depth != -1 && --depth == 0) { return; } diff --git a/src/app/messageparser.cpp b/src/app/messageparser.cpp index 7941e12d..a501b0b4 100644 --- a/src/app/messageparser.cpp +++ b/src/app/messageparser.cpp @@ -25,6 +25,18 @@ #include "md4c-html.h" +namespace { +// A callback function that will be called by the md4c library (`md_html`) to output the HTML. 
+void +htmlChunkCb(const MD_CHAR* data, MD_SIZE data_size, void* userData) +{ + QByteArray* array = static_cast(userData); + if (data_size > 0) { + array->append(data, int(data_size)); + } +}; +} // namespace + MessageParser::MessageParser(PreviewEngine* previewEngine, QObject* parent) : QObject(parent) , previewEngine_(previewEngine) @@ -51,9 +63,9 @@ MessageParser::parseMessage(const QString& messageId, // Now that we have the HTML, we can parse it to get a list of tags and their values. // We are only interested in the and
 tags.
             htmlParser_->parseHtmlString(html);
-            auto tagsMap = htmlParser_->getTags({TidyTag_A, TidyTag_DEL, TidyTag_PRE});
+            auto tagsMap = htmlParser_->getTagsNodes({TidyTag_A, TidyTag_DEL, TidyTag_PRE});
 
-            static QString styleTag("");
+            static const QString styleTag("");
             QString style;
 
             // Check for any 
 tags. If there are any, we need to:
@@ -89,11 +101,9 @@ MessageParser::parseMessage(const QString& messageId,
                 // If the user has enabled link previews, then we need to generate the link preview.
                 if (previewLinks) {
                     // Get the first link in the message.
-                    auto anchorTag = tagsMap[TidyTag_A].first();
-                    static QRegularExpression hrefRegex("href=\"(.*?)\"");
-                    auto match = hrefRegex.match(anchorTag);
-                    if (match.hasMatch()) {
-                        Q_EMIT previewEngine_->parseLink(messageId, match.captured(1));
+                    auto href = htmlParser_->getNodeAttr(tagsMap[TidyTag_A].first(), TidyAttr_HREF);
+                    if (!href.isEmpty()) { // empty when the anchor carries no href value
+                        Q_EMIT previewEngine_->parseLink(messageId, href);
                     }
                 }
 
@@ -110,13 +120,13 @@ void
 MessageParser::preprocessMarkdown(QString& markdown)
 {
     // Match all instances of the linefeed character.
-    static QRegularExpression newlineRegex("\n");
+    static const QRegularExpression newlineRegex("\\r?\\n"); // LF, optionally preceded by CR
     static const QString newline = "  \n";
 
     // Replace all instances of the linefeed character with 2 spaces + a linefeed character
     // in order to force a line break in the HTML.
     // Note: we should only do this for non-code fenced blocks.
-    static QRegularExpression codeFenceRe("`{1,3}([\\s\\S]*?)`{1,3}");
+    static const QRegularExpression codeFenceRe("`{1,3}([\\s\\S]*?)`{1,3}"); // non-greedy
     auto match = codeFenceRe.globalMatch(markdown);
 
     // If there are no code blocks, then we can just replace all linefeeds with 2 spaces
@@ -132,7 +142,7 @@ MessageParser::preprocessMarkdown(QString& markdown)
     enum BlockType { Text, Code };
     QVector> codeBlocks;
 
-    int start = 0;
+    qsizetype start = 0; // offset in markdown where the pending non-code text begins
     while (match.hasNext()) {
         auto m = match.next();
         auto nonCodelength = m.capturedStart() - start;
@@ -158,27 +168,16 @@ MessageParser::preprocessMarkdown(QString& markdown)
     }
 }
 
-// A callback function that will be called by the md4c library (`md_html`) to output the HTML.
-static void
-htmlChunkCb(const MD_CHAR* data, MD_SIZE data_size, void* userData)
-{
-    QByteArray* array = static_cast(userData);
-    if (data_size > 0) {
-        array->append(data, int(data_size));
-    }
-};
-
 QString
 MessageParser::markdownToHtml(const char* markdown)
 {
     static auto md_flags = MD_FLAG_PERMISSIVEAUTOLINKS | MD_FLAG_NOINDENTEDCODEBLOCKS
                            | MD_FLAG_TASKLISTS | MD_FLAG_STRIKETHROUGH | MD_FLAG_UNDERLINE;
-    size_t data_len = strlen(markdown);
+    const size_t data_len = strlen(markdown); // markdown must be non-null and NUL-terminated
     if (data_len <= 0) {
         return QString();
-    } else {
-        QByteArray array;
-        int result = md_html(markdown, MD_SIZE(data_len), &htmlChunkCb, &array, md_flags, 0);
-        return result == 0 ? QString::fromUtf8(array) : QString();
     }
+    QByteArray array; // receives the HTML chunks appended by htmlChunkCb
+    const int result = md_html(markdown, MD_SIZE(data_len), &htmlChunkCb, &array, md_flags, 0);
+    return result == 0 ? QString::fromUtf8(array) : QString();
 }
diff --git a/src/app/previewengine.cpp b/src/app/previewengine.cpp
index 5f4490f1..5e56fbb8 100644
--- a/src/app/previewengine.cpp
+++ b/src/app/previewengine.cpp
@@ -19,15 +19,6 @@
 
 #include 
 
-static QString
-getInnerHtml(const QString& tag)
-{
-    static const QRegularExpression re(">([^<]+)<");
-    const auto match = re.match(tag);
-    return match.hasMatch() ? match.captured(1) : QString {};
-};
-
-// Portable newline regex.
 const QRegularExpression PreviewEngine::newlineRe("\\r?\\n");
 
 PreviewEngine::PreviewEngine(ConnectivityMonitor* cm, QObject* parent)
@@ -39,12 +30,11 @@ PreviewEngine::PreviewEngine(ConnectivityMonitor* cm, QObject* parent)
 }
 
 QString
-PreviewEngine::getTagContent(QList& tags, const QString& value)
+PreviewEngine::getTagContent(const QList& tags, const QString& value)
 {
     Q_FOREACH (auto tag, tags) {
         const QRegularExpression re("(property|name)=\"(og:|twitter:|)" + value
                                     + "\".*?content=\"([^\"]+)\"");
-
         const auto match = re.match(tag.remove(newlineRe));
         if (match.hasMatch()) {
             return match.captured(3);
@@ -54,45 +44,44 @@ PreviewEngine::getTagContent(QList& tags, const QString& value)
 }
 
 QString
-PreviewEngine::getTitle(HtmlParser::TagInfoList& metaTags)
+PreviewEngine::getTitle(const QList& metaTags)
 {
     // Try with opengraph/twitter props
-    QString title = getTagContent(metaTags[TidyTag_META], "title");
+    QString title = getTagContent(metaTags, "title");
     if (title.isEmpty()) { // Try with title tag
-        title = getInnerHtml(htmlParser_->getFirstTagValue(TidyTag_TITLE));
+        title = htmlParser_->getTagInnerHtml(TidyTag_TITLE);
     }
     if (title.isEmpty()) { // Try with h1 tag
-        title = getInnerHtml(htmlParser_->getFirstTagValue(TidyTag_H1));
+        title = htmlParser_->getTagInnerHtml(TidyTag_H1);
     }
     if (title.isEmpty()) { // Try with h2 tag
-        title = getInnerHtml(htmlParser_->getFirstTagValue(TidyTag_H2));
+        title = htmlParser_->getTagInnerHtml(TidyTag_H2);
     }
     return title;
 }
 
 QString
-PreviewEngine::getDescription(HtmlParser::TagInfoList& metaTags)
+PreviewEngine::getDescription(const QList& metaTags)
 {
     // Try with og/twitter props
-    QString d = getTagContent(metaTags[TidyTag_META], "description");
-    if (d.isEmpty()) { // Try with first paragraph
-        d = getInnerHtml(htmlParser_->getFirstTagValue(TidyTag_P));
+    QString desc = getTagContent(metaTags, "description");
+    if (desc.isEmpty()) { // Try with first paragraph
+        desc = htmlParser_->getTagInnerHtml(TidyTag_P);
     }
-    return d;
+    return desc;
 }
 
 QString
-PreviewEngine::getImage(HtmlParser::TagInfoList& metaTags)
+PreviewEngine::getImage(const QList& metaTags)
 {
     // Try with og/twitter props
-    QString image = getTagContent(metaTags[TidyTag_META], "image");
+    QString image = getTagContent(metaTags, "image");
     if (image.isEmpty()) { // Try with href of link tag (rel="image_src")
-        auto tags = htmlParser_->getTags({TidyTag_LINK});
-        Q_FOREACH (auto tag, tags[TidyTag_LINK]) {
-            static const QRegularExpression re("rel=\"image_src\".*?href=\"([^\"]+)\"");
-            const auto match = re.match(tag.remove(newlineRe));
-            if (match.hasMatch()) {
-                return match.captured(1);
+        auto tagsNodes = htmlParser_->getTagsNodes({TidyTag_LINK});
+        Q_FOREACH (auto tag, tagsNodes[TidyTag_LINK]) { // only rel="image_src" links qualify
+            QString href = htmlParser_->getNodeAttr(tag, TidyAttr_HREF);
+            if (htmlParser_->getNodeAttr(tag, TidyAttr_REL) == "image_src" && !href.isEmpty()) {
+                return href;
             }
         }
     }
@@ -104,7 +93,12 @@ PreviewEngine::onParseLink(const QString& messageId, const QString& link)
 {
     sendGetRequest(QUrl(link), [this, messageId, link](const QByteArray& html) {
         htmlParser_->parseHtmlString(html);
-        auto metaTags = htmlParser_->getTags({TidyTag_META});
+        auto tagsNodes = htmlParser_->getTagsNodes({TidyTag_META});
+        auto metaTagNodes = tagsNodes[TidyTag_META];
+        QList metaTags; // text of each <meta> node, for the regex lookups in getTagContent()
+        Q_FOREACH (auto tag, metaTagNodes) {
+            metaTags.append(htmlParser_->getNodeText(tag));
+        }
         QString domain = QUrl(link).host();
         if (domain.isEmpty()) {
             domain = link;
diff --git a/src/app/previewengine.h b/src/app/previewengine.h
index db14a968..2f0144ad 100644
--- a/src/app/previewengine.h
+++ b/src/app/previewengine.h
@@ -39,10 +39,10 @@ private:
     // An instance of HtmlParser used to parse HTML.
     HtmlParser* htmlParser_;
 
-    QString getTagContent(QList& tags, const QString& value);
-    QString getTitle(HtmlParser::TagInfoList& metaTags);
-    QString getDescription(HtmlParser::TagInfoList& metaTags);
-    QString getImage(HtmlParser::TagInfoList& metaTags);
+    QString getTagContent(const QList& tags, const QString& value);
+    QString getTitle(const QList& metaTags);
+    QString getDescription(const QList& metaTags);
+    QString getImage(const QList& metaTags);
 
     static const QRegularExpression newlineRe;
 };
diff --git a/tests/unittests/messageparser_unittest.cpp b/tests/unittests/messageparser_unittest.cpp
index 4cbe16d9..8f91834b 100644
--- a/tests/unittests/messageparser_unittest.cpp
+++ b/tests/unittests/messageparser_unittest.cpp
@@ -117,7 +117,6 @@ TEST_F(MessageParserFixture, EndOfLineCharactersAreParsedCorrectly)
     auto backgroundColor = QColor::fromRgb(0, 0, 255);
 
     QSignalSpy messageParsedSpy(globalEnv.messageParser.data(), &MessageParser::messageParsed);
-    QSignalSpy linkInfoReadySpy(globalEnv.messageParser.data(), &MessageParser::linkInfoReady);
 
     // Parse a message with a link.
     globalEnv.messageParser->parseMessage("msgId_03",
@@ -148,7 +147,6 @@ TEST_F(MessageParserFixture, FencedCodeIsParsedCorrectly)
     auto backgroundColor = QColor::fromRgb(0, 0, 255);
 
     QSignalSpy messageParsedSpy(globalEnv.messageParser.data(), &MessageParser::messageParsed);
-    QSignalSpy linkInfoReadySpy(globalEnv.messageParser.data(), &MessageParser::linkInfoReady);
 
     // Parse a message with a link.
     globalEnv.messageParser->parseMessage("msgId_04",
@@ -169,3 +167,41 @@ TEST_F(MessageParserFixture, FencedCodeIsParsedCorrectly)
               "

Text with

\n
code\n
\n"); } + +/*! + * WHEN We parse a text body with a youtube link. + * THEN PreviewEngine::parseLink should be called with the correct arguments. + */ +TEST_F(MessageParserFixture, YoutubeLinkIsParsedCorrectly) +{ + auto url = "https://www.youtube.com/watch?v=1234567890"; + auto msg = "blah blah " + QString(url) + " blah blah"; + + QSignalSpy messageParsedSpy(globalEnv.messageParser.data(), &MessageParser::messageParsed); + QSignalSpy linkInfoReadySpy(globalEnv.messageParser.data(), &MessageParser::linkInfoReady); + + // Parse a message with a link. + globalEnv.messageParser->parseMessage("msgId_05", + msg, + true, + QColor::fromRgb(0, 0, 255), + QColor::fromRgb(0, 0, 255)); + + // Wait for the messageParsed signal which should be emitted once. + messageParsedSpy.wait(); + EXPECT_EQ(messageParsedSpy.count(), 1); + + QList messageParserArguments = messageParsedSpy.takeFirst(); + EXPECT_TRUE(messageParserArguments.at(0).typeId() == qMetaTypeId()); + + // Wait for the linkInfoReady signal which should be emitted once. + linkInfoReadySpy.wait(); + EXPECT_EQ(linkInfoReadySpy.count(), 1); + + QList linkInfoReadyArguments = linkInfoReadySpy.takeFirst(); + EXPECT_TRUE(linkInfoReadyArguments.at(0).typeId() == qMetaTypeId()); + EXPECT_EQ(linkInfoReadyArguments.at(0).toString(), "msgId_05"); + EXPECT_TRUE(linkInfoReadyArguments.at(1).typeId() == qMetaTypeId()); + QVariantMap linkInfo = linkInfoReadyArguments.at(1).toMap(); + EXPECT_EQ(linkInfo["url"].toString(), url); +}