1
0
Fork 0
mirror of https://git.jami.net/savoirfairelinux/jami-client-qt.git synced 2025-08-04 14:55:43 +02:00

messageparser: replace regexes with tidy API

Depend on tidy API for attribute extraction rather than regexes.

1. htmlparser methods return nodes instead of pre-parsed strings
2. htmlparser provides some methods to extract text/attr from nodes

Gitlab: #1248
Change-Id: I367d703680938fb0b7c5055ac41e079c1322da30
This commit is contained in:
Andreas Traczyk 2023-07-06 18:55:14 -04:00 committed by Sébastien Blin
parent ec0feef74d
commit 61126cfa64
5 changed files with 150 additions and 78 deletions

View file

@ -39,6 +39,7 @@ public:
doc_ = tidyCreate(); doc_ = tidyCreate();
tidyOptSetBool(doc_, TidyQuiet, yes); tidyOptSetBool(doc_, TidyQuiet, yes);
tidyOptSetBool(doc_, TidyShowWarnings, no); tidyOptSetBool(doc_, TidyShowWarnings, no);
tidyOptSetInt(doc_, TidyUseCustomTags, TidyCustomEmpty);
} }
~HtmlParser() ~HtmlParser()
@ -51,46 +52,88 @@ public:
return tidyParseString(doc_, html.toLocal8Bit().data()) >= 0; return tidyParseString(doc_, html.toLocal8Bit().data()) >= 0;
} }
using TagInfoList = QMap<TidyTagId, QList<QString>>; using TagNodeList = QMap<TidyTagId, QList<TidyNode>>;
// A function that traverses the DOM tree and fills a QVariantMap with a list // A function that traverses the DOM tree and fills a QVariantMap with a list
// of the tags and their values. The result is structured as follows: // of the tags and their nodes. The result is structured as follows:
// {tagId1: ["tagValue1", "tagValue2", ...], // {tagId1: [tagNode1, tagNode2, ...],
// tagId: ["tagValue1", "tagValue2", ...], // tagId2: [tagNode3, tagNode4, ...],
// ... } // ... }
TagInfoList getTags(QList<TidyTagId> tags, int maxDepth = -1) TagNodeList getTagsNodes(const QList<TidyTagId>& tags, int maxDepth = -1)
{ {
TagInfoList result; TagNodeList result;
traverseNode( traverseNode(
tidyGetRoot(doc_), tidyGetRoot(doc_),
tags, tags,
[&result](const QString& value, TidyTagId tag) { result[tag].append(value); }, [&result](TidyNode node, TidyTagId tag) { result[tag].append(node); },
maxDepth); maxDepth);
return result; return result;
} }
QString getFirstTagValue(TidyTagId tag, int maxDepth = -1) // The same as the above function, only it returns the first node for a single tag.
TidyNode getFirstTagNode(TidyTagId tag, int maxDepth = -1)
{ {
QString result; TidyNode result = nullptr;
traverseNode( traverseNode(
tidyGetRoot(doc_), tidyGetRoot(doc_),
{tag}, {tag},
[&result](const QString& value, TidyTagId) { result = value; }, [&result](TidyNode node, TidyTagId) { result = node; },
maxDepth); maxDepth);
return result; return result;
} }
private: // Extract the text value from a node.
void traverseNode(TidyNode node, QString getNodeText(TidyNode node)
QList<TidyTagId> tags,
const std::function<void(const QString&, TidyTagId)>& cb,
int depth = -1)
{ {
TidyBuffer nodeValue = {}; TidyBuffer nodeValue = {};
if (!node || tidyNodeGetText(doc_, node, &nodeValue) != yes) {
return QString();
}
QString result = QString::fromUtf8((char*) nodeValue.bp, nodeValue.size);
tidyBufFree(&nodeValue);
return result;
}
// Extract the value of the attribute `attrId` from `node`.
//
// Returns an empty QString when `node` is null, when the node does not carry
// the requested attribute, or when the attribute has no value.
QString getNodeAttr(TidyNode node, TidyAttrId attrId)
{
    // Guard against a null node, as getNodeText() and getNodeInnerHtml() do,
    // so a failed node lookup cannot be handed to libtidy.
    if (!node) {
        return QString();
    }
    TidyAttr attr = tidyAttrGetById(node, attrId);
    if (!attr) {
        return QString();
    }
    const auto* attrValue = tidyAttrValue(attr);
    if (!attrValue) {
        return QString();
    }
    // NOTE(review): getNodeText() decodes tidy output with fromUtf8 while this
    // decodes with fromLocal8Bit — confirm which encoding is intended here.
    return QString::fromLocal8Bit(attrValue);
}
// Return the text of the first child of `node` (via getNodeText).
// Yields an empty QString when `node` is null or has no child.
QString getNodeInnerHtml(TidyNode node)
{
    if (node) {
        if (const auto* firstChild = tidyGetChild(node)) {
            return getNodeText(firstChild);
        }
    }
    return QString();
}
// Convenience wrapper: look up a node for `tag` with getFirstTagNode() and
// return its inner HTML as produced by getNodeInnerHtml().
QString getTagInnerHtml(TidyTagId tag)
{
    const TidyNode tagNode = getFirstTagNode(tag);
    return getNodeInnerHtml(tagNode);
}
private:
// NOLINTNEXTLINE(misc-no-recursion)
void traverseNode(TidyNode node,
const QList<TidyTagId>& tags,
const std::function<void(TidyNode, TidyTagId)>& cb,
int depth = -1)
{
for (auto tag : tags) { for (auto tag : tags) {
if (tidyNodeGetId(node) == tag && tidyNodeGetText(doc_, node, &nodeValue) == yes && cb) { if (tidyNodeGetId(node) == tag && cb) {
cb(QString::fromLocal8Bit(nodeValue.bp), tag); cb(node, tag);
if (depth != -1 && --depth == 0) { if (depth != -1 && --depth == 0) {
return; return;
} }

View file

@ -25,6 +25,18 @@
#include "md4c-html.h" #include "md4c-html.h"
namespace {
// Output callback passed to md4c's `md_html`: each chunk of generated HTML is
// appended to the QByteArray that is passed through `userData`.
void
htmlChunkCb(const MD_CHAR* data, MD_SIZE data_size, void* userData)
{
    // Nothing to append for an empty chunk.
    if (data_size <= 0) {
        return;
    }
    auto* sink = static_cast<QByteArray*>(userData);
    sink->append(data, static_cast<int>(data_size));
}
} // namespace
MessageParser::MessageParser(PreviewEngine* previewEngine, QObject* parent) MessageParser::MessageParser(PreviewEngine* previewEngine, QObject* parent)
: QObject(parent) : QObject(parent)
, previewEngine_(previewEngine) , previewEngine_(previewEngine)
@ -51,9 +63,9 @@ MessageParser::parseMessage(const QString& messageId,
// Now that we have the HTML, we can parse it to get a list of tags and their values. // Now that we have the HTML, we can parse it to get a list of tags and their values.
// We are only interested in the <a> and <pre> tags. // We are only interested in the <a> and <pre> tags.
htmlParser_->parseHtmlString(html); htmlParser_->parseHtmlString(html);
auto tagsMap = htmlParser_->getTags({TidyTag_A, TidyTag_DEL, TidyTag_PRE}); auto tagsMap = htmlParser_->getTagsNodes({TidyTag_A, TidyTag_DEL, TidyTag_PRE});
static QString styleTag("<style>%1</style>"); static const QString styleTag("<style>%1</style>");
QString style; QString style;
// Check for any <pre> tags. If there are any, we need to: // Check for any <pre> tags. If there are any, we need to:
@ -89,11 +101,9 @@ MessageParser::parseMessage(const QString& messageId,
// If the user has enabled link previews, then we need to generate the link preview. // If the user has enabled link previews, then we need to generate the link preview.
if (previewLinks) { if (previewLinks) {
// Get the first link in the message. // Get the first link in the message.
auto anchorTag = tagsMap[TidyTag_A].first(); auto href = htmlParser_->getNodeAttr(tagsMap[TidyTag_A].first(), TidyAttr_HREF);
static QRegularExpression hrefRegex("href=\"(.*?)\""); if (!href.isEmpty()) {
auto match = hrefRegex.match(anchorTag); Q_EMIT previewEngine_->parseLink(messageId, href);
if (match.hasMatch()) {
Q_EMIT previewEngine_->parseLink(messageId, match.captured(1));
} }
} }
@ -110,13 +120,13 @@ void
MessageParser::preprocessMarkdown(QString& markdown) MessageParser::preprocessMarkdown(QString& markdown)
{ {
// Match all instances of the linefeed character. // Match all instances of the linefeed character.
static QRegularExpression newlineRegex("\n"); static const QRegularExpression newlineRegex("\\r?\\n");
static const QString newline = " \n"; static const QString newline = " \n";
// Replace all instances of the linefeed character with 2 spaces + a linefeed character // Replace all instances of the linefeed character with 2 spaces + a linefeed character
// in order to force a line break in the HTML. // in order to force a line break in the HTML.
// Note: we should only do this for non-code fenced blocks. // Note: we should only do this for non-code fenced blocks.
static QRegularExpression codeFenceRe("`{1,3}([\\s\\S]*?)`{1,3}"); static const QRegularExpression codeFenceRe("`{1,3}([\\s\\S]*?)`{1,3}");
auto match = codeFenceRe.globalMatch(markdown); auto match = codeFenceRe.globalMatch(markdown);
// If there are no code blocks, then we can just replace all linefeeds with 2 spaces // If there are no code blocks, then we can just replace all linefeeds with 2 spaces
@ -132,7 +142,7 @@ MessageParser::preprocessMarkdown(QString& markdown)
enum BlockType { Text, Code }; enum BlockType { Text, Code };
QVector<QPair<BlockType, QString>> codeBlocks; QVector<QPair<BlockType, QString>> codeBlocks;
int start = 0; qsizetype start = 0;
while (match.hasNext()) { while (match.hasNext()) {
auto m = match.next(); auto m = match.next();
auto nonCodelength = m.capturedStart() - start; auto nonCodelength = m.capturedStart() - start;
@ -158,27 +168,16 @@ MessageParser::preprocessMarkdown(QString& markdown)
} }
} }
// Output callback passed to md4c's `md_html`: each chunk of generated HTML is
// appended to the QByteArray that is passed through `userData`.
static void
htmlChunkCb(const MD_CHAR* data, MD_SIZE data_size, void* userData)
{
    // Nothing to append for an empty chunk.
    if (data_size <= 0) {
        return;
    }
    auto* sink = static_cast<QByteArray*>(userData);
    sink->append(data, static_cast<int>(data_size));
}
QString QString
MessageParser::markdownToHtml(const char* markdown) MessageParser::markdownToHtml(const char* markdown)
{ {
static auto md_flags = MD_FLAG_PERMISSIVEAUTOLINKS | MD_FLAG_NOINDENTEDCODEBLOCKS static auto md_flags = MD_FLAG_PERMISSIVEAUTOLINKS | MD_FLAG_NOINDENTEDCODEBLOCKS
| MD_FLAG_TASKLISTS | MD_FLAG_STRIKETHROUGH | MD_FLAG_UNDERLINE; | MD_FLAG_TASKLISTS | MD_FLAG_STRIKETHROUGH | MD_FLAG_UNDERLINE;
size_t data_len = strlen(markdown); const size_t data_len = strlen(markdown);
if (data_len <= 0) { if (data_len <= 0) {
return QString(); return QString();
} else {
QByteArray array;
int result = md_html(markdown, MD_SIZE(data_len), &htmlChunkCb, &array, md_flags, 0);
return result == 0 ? QString::fromUtf8(array) : QString();
} }
QByteArray array;
const int result = md_html(markdown, MD_SIZE(data_len), &htmlChunkCb, &array, md_flags, 0);
return result == 0 ? QString::fromUtf8(array) : QString();
} }

View file

@ -19,15 +19,6 @@
#include <QRegularExpression> #include <QRegularExpression>
// Return the first run of non-'<' characters found between a '>' and the
// following '<' in `tag`, or an empty QString when there is no such run.
static QString
getInnerHtml(const QString& tag)
{
    static const QRegularExpression re(">([^<]+)<");
    const auto found = re.match(tag);
    if (!found.hasMatch()) {
        return QString {};
    }
    return found.captured(1);
}
// Portable newline regex.
const QRegularExpression PreviewEngine::newlineRe("\\r?\\n"); const QRegularExpression PreviewEngine::newlineRe("\\r?\\n");
PreviewEngine::PreviewEngine(ConnectivityMonitor* cm, QObject* parent) PreviewEngine::PreviewEngine(ConnectivityMonitor* cm, QObject* parent)
@ -39,12 +30,11 @@ PreviewEngine::PreviewEngine(ConnectivityMonitor* cm, QObject* parent)
} }
QString QString
PreviewEngine::getTagContent(QList<QString>& tags, const QString& value) PreviewEngine::getTagContent(const QList<QString>& tags, const QString& value)
{ {
Q_FOREACH (auto tag, tags) { Q_FOREACH (auto tag, tags) {
const QRegularExpression re("(property|name)=\"(og:|twitter:|)" + value const QRegularExpression re("(property|name)=\"(og:|twitter:|)" + value
+ "\".*?content=\"([^\"]+)\""); + "\".*?content=\"([^\"]+)\"");
const auto match = re.match(tag.remove(newlineRe)); const auto match = re.match(tag.remove(newlineRe));
if (match.hasMatch()) { if (match.hasMatch()) {
return match.captured(3); return match.captured(3);
@ -54,45 +44,44 @@ PreviewEngine::getTagContent(QList<QString>& tags, const QString& value)
} }
QString QString
PreviewEngine::getTitle(HtmlParser::TagInfoList& metaTags) PreviewEngine::getTitle(const QList<QString>& metaTags)
{ {
// Try with opengraph/twitter props // Try with opengraph/twitter props
QString title = getTagContent(metaTags[TidyTag_META], "title"); QString title = getTagContent(metaTags, "title");
if (title.isEmpty()) { // Try with title tag if (title.isEmpty()) { // Try with title tag
title = getInnerHtml(htmlParser_->getFirstTagValue(TidyTag_TITLE)); title = htmlParser_->getTagInnerHtml(TidyTag_TITLE);
} }
if (title.isEmpty()) { // Try with h1 tag if (title.isEmpty()) { // Try with h1 tag
title = getInnerHtml(htmlParser_->getFirstTagValue(TidyTag_H1)); title = htmlParser_->getTagInnerHtml(TidyTag_H1);
} }
if (title.isEmpty()) { // Try with h2 tag if (title.isEmpty()) { // Try with h2 tag
title = getInnerHtml(htmlParser_->getFirstTagValue(TidyTag_H2)); title = htmlParser_->getTagInnerHtml(TidyTag_H2);
} }
return title; return title;
} }
QString QString
PreviewEngine::getDescription(HtmlParser::TagInfoList& metaTags) PreviewEngine::getDescription(const QList<QString>& metaTags)
{ {
// Try with og/twitter props // Try with og/twitter props
QString d = getTagContent(metaTags[TidyTag_META], "description"); QString desc = getTagContent(metaTags, "description");
if (d.isEmpty()) { // Try with first paragraph if (desc.isEmpty()) { // Try with first paragraph
d = getInnerHtml(htmlParser_->getFirstTagValue(TidyTag_P)); desc = htmlParser_->getTagInnerHtml(TidyTag_P);
} }
return d; return desc;
} }
QString QString
PreviewEngine::getImage(HtmlParser::TagInfoList& metaTags) PreviewEngine::getImage(const QList<QString>& metaTags)
{ {
// Try with og/twitter props // Try with og/twitter props
QString image = getTagContent(metaTags[TidyTag_META], "image"); QString image = getTagContent(metaTags, "image");
if (image.isEmpty()) { // Try with href of link tag (rel="image_src") if (image.isEmpty()) { // Try with href of link tag (rel="image_src")
auto tags = htmlParser_->getTags({TidyTag_LINK}); auto tagsNodes = htmlParser_->getTagsNodes({TidyTag_LINK});
Q_FOREACH (auto tag, tags[TidyTag_LINK]) { Q_FOREACH (auto tag, tagsNodes[TidyTag_LINK]) {
static const QRegularExpression re("rel=\"image_src\".*?href=\"([^\"]+)\""); QString href = htmlParser_->getNodeAttr(tag, TidyAttr_HREF);
const auto match = re.match(tag.remove(newlineRe)); if (!href.isEmpty()) {
if (match.hasMatch()) { return href;
return match.captured(1);
} }
} }
} }
@ -104,7 +93,12 @@ PreviewEngine::onParseLink(const QString& messageId, const QString& link)
{ {
sendGetRequest(QUrl(link), [this, messageId, link](const QByteArray& html) { sendGetRequest(QUrl(link), [this, messageId, link](const QByteArray& html) {
htmlParser_->parseHtmlString(html); htmlParser_->parseHtmlString(html);
auto metaTags = htmlParser_->getTags({TidyTag_META}); auto tagsNodes = htmlParser_->getTagsNodes({TidyTag_META});
auto metaTagNodes = tagsNodes[TidyTag_META];
QList<QString> metaTags;
Q_FOREACH (auto tag, metaTagNodes) {
metaTags.append(htmlParser_->getNodeText(tag));
}
QString domain = QUrl(link).host(); QString domain = QUrl(link).host();
if (domain.isEmpty()) { if (domain.isEmpty()) {
domain = link; domain = link;

View file

@ -39,10 +39,10 @@ private:
// An instance of HtmlParser used to parse HTML. // An instance of HtmlParser used to parse HTML.
HtmlParser* htmlParser_; HtmlParser* htmlParser_;
QString getTagContent(QList<QString>& tags, const QString& value); QString getTagContent(const QList<QString>& tags, const QString& value);
QString getTitle(HtmlParser::TagInfoList& metaTags); QString getTitle(const QList<QString>& metaTags);
QString getDescription(HtmlParser::TagInfoList& metaTags); QString getDescription(const QList<QString>& metaTags);
QString getImage(HtmlParser::TagInfoList& metaTags); QString getImage(const QList<QString>& metaTags);
static const QRegularExpression newlineRe; static const QRegularExpression newlineRe;
}; };

View file

@ -117,7 +117,6 @@ TEST_F(MessageParserFixture, EndOfLineCharactersAreParsedCorrectly)
auto backgroundColor = QColor::fromRgb(0, 0, 255); auto backgroundColor = QColor::fromRgb(0, 0, 255);
QSignalSpy messageParsedSpy(globalEnv.messageParser.data(), &MessageParser::messageParsed); QSignalSpy messageParsedSpy(globalEnv.messageParser.data(), &MessageParser::messageParsed);
QSignalSpy linkInfoReadySpy(globalEnv.messageParser.data(), &MessageParser::linkInfoReady);
// Parse a message with a link. // Parse a message with a link.
globalEnv.messageParser->parseMessage("msgId_03", globalEnv.messageParser->parseMessage("msgId_03",
@ -148,7 +147,6 @@ TEST_F(MessageParserFixture, FencedCodeIsParsedCorrectly)
auto backgroundColor = QColor::fromRgb(0, 0, 255); auto backgroundColor = QColor::fromRgb(0, 0, 255);
QSignalSpy messageParsedSpy(globalEnv.messageParser.data(), &MessageParser::messageParsed); QSignalSpy messageParsedSpy(globalEnv.messageParser.data(), &MessageParser::messageParsed);
QSignalSpy linkInfoReadySpy(globalEnv.messageParser.data(), &MessageParser::linkInfoReady);
// Parse a message with a link. // Parse a message with a link.
globalEnv.messageParser->parseMessage("msgId_04", globalEnv.messageParser->parseMessage("msgId_04",
@ -169,3 +167,41 @@ TEST_F(MessageParserFixture, FencedCodeIsParsedCorrectly)
"<style>pre,code{background-color:#0000ff;color:#ffffff;white-space:pre-wrap;" "<style>pre,code{background-color:#0000ff;color:#ffffff;white-space:pre-wrap;"
"}</style><p>Text with</p>\n<pre><code>code\n</code></pre>\n"); "}</style><p>Text with</p>\n<pre><code>code\n</code></pre>\n");
} }
/*!
 * WHEN We parse a text body with a youtube link.
 * THEN messageParsed is emitted once, and linkInfoReady is emitted once with
 *      the message id and a link-info map whose "url" entry is the link.
 */
TEST_F(MessageParserFixture, YoutubeLinkIsParsedCorrectly)
{
    const auto* link = "https://www.youtube.com/watch?v=1234567890";
    const auto body = "blah blah " + QString(link) + " blah blah";
    const auto blue = QColor::fromRgb(0, 0, 255);

    // Both spies are attached before parsing so no emission can be missed.
    QSignalSpy parsedSpy(globalEnv.messageParser.data(), &MessageParser::messageParsed);
    QSignalSpy linkInfoSpy(globalEnv.messageParser.data(), &MessageParser::linkInfoReady);

    // Parse a message containing a link, with link previews enabled.
    globalEnv.messageParser->parseMessage("msgId_05", body, true, blue, blue);

    // The messageParsed signal should be emitted exactly once.
    parsedSpy.wait();
    EXPECT_EQ(parsedSpy.count(), 1);

    QList<QVariant> parsedArgs = parsedSpy.takeFirst();
    EXPECT_EQ(parsedArgs.at(0).typeId(), qMetaTypeId<QString>());

    // The linkInfoReady signal should be emitted exactly once.
    linkInfoSpy.wait();
    EXPECT_EQ(linkInfoSpy.count(), 1);

    // Its arguments are the message id followed by the link-info map.
    QList<QVariant> linkInfoArgs = linkInfoSpy.takeFirst();
    EXPECT_EQ(linkInfoArgs.at(0).typeId(), qMetaTypeId<QString>());
    EXPECT_EQ(linkInfoArgs.at(0).toString(), "msgId_05");
    EXPECT_EQ(linkInfoArgs.at(1).typeId(), qMetaTypeId<QVariantMap>());

    QVariantMap info = linkInfoArgs.at(1).toMap();
    EXPECT_EQ(info["url"].toString(), link);
}